Merge branch 'master' into patch-6

78ddf6eb · cclauss · GitHub · 50cb0365 · 1f34fcaf · 78ddf6eb
Unverified Commit 78ddf6eb authored Jan 26, 2018 by cclauss Committed by GitHub Jan 26, 2018
20 changed files
--- a/research/compression/entropy_coder/lib/blocks_std_test.py
+++ b/research/compression/entropy_coder/lib/blocks_std_test.py
@@ -22,6 +22,7 @@ import math
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import blocks_std

--- a/research/delf/delf/python/feature_io.py
+++ b/research/delf/delf/python/feature_io.py
@@ -25,6 +25,7 @@ from __future__ import print_function
 from delf import feature_pb2
 from delf import datum_io
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf



--- a/research/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py
+++ b/research/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py
@@ -22,6 +22,7 @@ import sys
 import time

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 from differential_privacy.dp_sgd.dp_optimizer import dp_optimizer

--- a/research/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py
+++ b/research/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py
@@ -17,6 +17,7 @@

 import collections

+from six.moves import xrange
 import tensorflow as tf

 OrderedDict = collections.OrderedDict

--- a/research/differential_privacy/multiple_teachers/aggregation.py
+++ b/research/differential_privacy/multiple_teachers/aggregation.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function

 import numpy as np
+from six.moves import xrange


 def labels_from_probs(probs):
@@ -127,5 +128,3 @@ def aggregation_most_frequent(logits):
    result[i] = np.argmax(label_counts)

  return np.asarray(result, dtype=np.int32)
-
-
--- a/research/differential_privacy/multiple_teachers/analysis.py
+++ b/research/differential_privacy/multiple_teachers/analysis.py
@@ -41,6 +41,7 @@ python analysis.py
 import os
 import math
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 from differential_privacy.multiple_teachers.input import maybe_download
@@ -139,7 +140,7 @@ def logmgf_exact(q, priv_eps, l):
    try:
      log_t = math.log(t)
    except ValueError:
-      print "Got ValueError in math.log for values :" + str((q, priv_eps, l, t))
+      print("Got ValueError in math.log for values :" + str((q, priv_eps, l, t)))
      log_t = priv_eps * l
  else:
    log_t = priv_eps * l
@@ -171,7 +172,7 @@ def sens_at_k(counts, noise_eps, l, k):
  """
  counts_sorted = sorted(counts, reverse=True)
  if 0.5 * noise_eps * l > 1:
-    print "l too large to compute sensitivity"
+    print("l too large to compute sensitivity")
    return 0
  # Now we can assume that at k, gap remains positive
  # or we have reached the point where logmgf_exact is
@@ -268,8 +269,8 @@ def main(unused_argv):
  # Solving gives eps = (alpha - ln (delta))/l
  eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list

-  print "Epsilons (Noisy Max): " + str(eps_list_nm)
-  print "Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list)
+  print("Epsilons (Noisy Max): " + str(eps_list_nm))
+  print("Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list))

  # If beta < eps / 2 ln (1/delta), then adding noise Lap(1) * 2 SS/eps
  # is eps,delta DP
@@ -280,12 +281,12 @@ def main(unused_argv):
  # Print the first one's scale
  ss_eps = 2.0 * beta * math.log(1/delta)
  ss_scale = 2.0 / ss_eps
-  print "To get an " + str(ss_eps) + "-DP estimate of epsilon, "
-  print "..add noise ~ " + str(ss_scale)
-  print "... times " + str(total_ss_nm / l_list)
-  print "Epsilon = " + str(min(eps_list_nm)) + "."
+  print("To get an " + str(ss_eps) + "-DP estimate of epsilon, ")
+  print("..add noise ~ " + str(ss_scale))
+  print("... times " + str(total_ss_nm / l_list))
+  print("Epsilon = " + str(min(eps_list_nm)) + ".")
  if min(eps_list_nm) == eps_list_nm[-1]:
-    print "Warning: May not have used enough values of l"
+    print("Warning: May not have used enough values of l")

  # Data independent bound, as mechanism is
  # 2*noise_eps DP.
@@ -294,7 +295,7 @@ def main(unused_argv):
      [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list])

  data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list
-  print "Data independent bound = " + str(min(data_ind_eps_list)) + "."
+  print("Data independent bound = " + str(min(data_ind_eps_list)) + ".")

  return


--- a/research/differential_privacy/multiple_teachers/deep_cnn.py
+++ b/research/differential_privacy/multiple_teachers/deep_cnn.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from datetime import datetime
 import math
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf
 import time

@@ -600,5 +601,3 @@ def softmax_preds(images, ckpt_path, return_logits=False):
  tf.reset_default_graph()

  return preds
-
-
--- a/research/differential_privacy/multiple_teachers/input.py
+++ b/research/differential_privacy/multiple_teachers/input.py
@@ -24,6 +24,7 @@ import numpy as np
 import os
 from scipy.io import loadmat as loadmat
 from six.moves import urllib
+from six.moves import xrange
 import sys
 import tarfile


--- a/research/differential_privacy/multiple_teachers/train_student.py
+++ b/research/differential_privacy/multiple_teachers/train_student.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 from differential_privacy.multiple_teachers import aggregation

--- a/research/differential_privacy/privacy_accountant/python/gaussian_moments.py
+++ b/research/differential_privacy/privacy_accountant/python/gaussian_moments.py
@@ -40,12 +40,15 @@ To verify that the I1 >= I2 (see comments in GaussianMomentsAccountant in
 accountant.py for the context), run the same loop above with verify=True
 passed to compute_log_moment.
 """
+from __future__ import print_function
+
 import math
 import sys

 import numpy as np
 import scipy.integrate as integrate
 import scipy.stats
+from six.moves import xrange
 from sympy.mpmath import mp


@@ -108,10 +111,10 @@ def compute_a(sigma, q, lmbd, verbose=False):
  a_lambda_exact = ((1.0 - q) * a_lambda_first_term_exact +
                    q * a_lambda_second_term_exact)
  if verbose:
-    print "A: by binomial expansion    {} = {} + {}".format(
+    print("A: by binomial expansion    {} = {} + {}".format(
        a_lambda_exact,
        (1.0 - q) * a_lambda_first_term_exact,
-        q * a_lambda_second_term_exact)
+        q * a_lambda_second_term_exact))
  return _to_np_float64(a_lambda_exact)


@@ -125,8 +128,8 @@ def compute_b(sigma, q, lmbd, verbose=False):
  b_fn = lambda z: (np.power(mu0(z) / mu(z), lmbd) -
                    np.power(mu(-z) / mu0(z), lmbd))
  if verbose:
-    print "M =", m
-    print "f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m))
+    print("M =", m)
+    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
    assert b_fn(-m) < 0 and b_fn(m) < 0

  b_lambda_int1_fn = lambda z: (mu0(z) *
@@ -140,9 +143,9 @@ def compute_b(sigma, q, lmbd, verbose=False):
  b_bound = a_lambda_m1 + b_int1 - b_int2

  if verbose:
-    print "B: by numerical integration", b_lambda
-    print "B must be no more than     ", b_bound
-  print b_lambda, b_bound
+    print("B: by numerical integration", b_lambda)
+    print("B must be no more than     ", b_bound)
+  print(b_lambda, b_bound)
  return _to_np_float64(b_lambda)


@@ -188,10 +191,10 @@ def compute_a_mp(sigma, q, lmbd, verbose=False):
  a_lambda_second_term = integral_inf_mp(a_lambda_second_term_fn)

  if verbose:
-    print "A: by numerical integration {} = {} + {}".format(
+    print("A: by numerical integration {} = {} + {}".format(
        a_lambda,
        (1 - q) * a_lambda_first_term,
-        q * a_lambda_second_term)
+        q * a_lambda_second_term))

  return _to_np_float64(a_lambda)

@@ -210,8 +213,8 @@ def compute_b_mp(sigma, q, lmbd, verbose=False):
  b_fn = lambda z: ((mu0(z) / mu(z)) ** lmbd_int -
                    (mu(-z) / mu0(z)) ** lmbd_int)
  if verbose:
-    print "M =", m
-    print "f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m))
+    print("M =", m)
+    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
    assert b_fn(-m) < 0 and b_fn(m) < 0

  b_lambda_int1_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int
@@ -223,8 +226,8 @@ def compute_b_mp(sigma, q, lmbd, verbose=False):
  b_bound = a_lambda_m1 + b_int1 - b_int2

  if verbose:
-    print "B by numerical integration", b_lambda
-    print "B must be no more than    ", b_bound
+    print("B by numerical integration", b_lambda)
+    print("B must be no more than    ", b_bound)
  assert b_lambda < b_bound + 1e-5
  return _to_np_float64(b_lambda)


--- a/research/domain_adaptation/domain_separation/dsn_eval.py
+++ b/research/domain_adaptation/domain_separation/dsn_eval.py
@@ -19,6 +19,7 @@
 import math

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 from domain_adaptation.datasets import dataset_factory

--- a/research/fivo/README.md
+++ b/research/fivo/README.md
+# Filtering Variational Objectives
+
+This folder contains a TensorFlow implementation of the algorithms from
+
+Chris J. Maddison\*, Dieterich Lawson\*, George Tucker\*, Nicolas Heess, Mohammad Norouzi, Andriy Mnih, Arnaud Doucet, and Yee Whye Teh. "Filtering Variational Objectives." NIPS 2017.
+
+[https://arxiv.org/abs/1705.09279](https://arxiv.org/abs/1705.09279)
+
+This code implements 3 different bounds for training sequential latent variable models: the evidence lower bound (ELBO), the importance weighted auto-encoder bound (IWAE), and our bound, the filtering variational objective (FIVO).
+
+Additionally it contains an implementation of the variational recurrent neural network (VRNN), a sequential latent variable model that can be trained using these three objectives. This repo provides code for training a VRNN to do sequence modeling of pianoroll and speech data.
+
+#### Directory Structure
+The important parts of the code are organized as follows.
+
+```
+fivo.py           # main script, contains flag definitions
+runners.py        # graph construction code for training and evaluation
+bounds.py         # code for computing each bound
+data
+├── datasets.py                    # readers for pianoroll and speech datasets
+├── calculate_pianoroll_mean.py    # preprocesses the pianoroll datasets
+└── create_timit_dataset.py        # preprocesses the TIMIT dataset
+models
+└── vrnn.py       # variational RNN implementation
+bin
+├── run_train.sh            # an example script that runs training
+├── run_eval.sh             # an example script that runs evaluation
+└── download_pianorolls.sh  # a script that downloads the pianoroll files
+```
+
+### Training on Pianorolls
+
+Requirements before we start:
+
+* TensorFlow (see [tensorflow.org](http://tensorflow.org) for how to install)
+* [scipy](https://www.scipy.org/)
+* [sonnet](https://github.com/deepmind/sonnet)
+
+
+#### Download the Data
+
+The pianoroll datasets are encoded as pickled sparse arrays and are available at [http://www-etud.iro.umontreal.ca/~boulanni/icml2012](http://www-etud.iro.umontreal.ca/~boulanni/icml2012). You can use the script `bin/download_pianorolls.sh` to download the files into a directory of your choosing.
+```
+export PIANOROLL_DIR=~/pianorolls
+mkdir $PIANOROLL_DIR
+sh bin/download_pianorolls.sh $PIANOROLL_DIR
+```
+
+#### Preprocess the Data
+
+The script `calculate_pianoroll_mean.py` loads a pianoroll pickle file, calculates the mean, updates the pickle file to include the mean under the key `train_mean`, and writes the file back to disk in-place. You should do this for all pianoroll datasets you wish to train on.
+
+```
+python data/calculate_pianoroll_mean.py --in_file=$PIANOROLL_DIR/piano-midi.de.pkl
+python data/calculate_pianoroll_mean.py --in_file=$PIANOROLL_DIR/nottingham.pkl
+python data/calculate_pianoroll_mean.py --in_file=$PIANOROLL_DIR/musedata.pkl
+python data/calculate_pianoroll_mean.py --in_file=$PIANOROLL_DIR/jsb.pkl
+```
+
+#### Training
+
+Now we can train a model. Here is a standard training run, taken from `bin/run_train.sh`:
+```
+python fivo.py \
+  --mode=train \
+  --logdir=/tmp/fivo \
+  --model=vrnn \
+  --bound=fivo \
+  --summarize_every=100 \
+  --batch_size=4 \
+  --num_samples=4 \
+  --learning_rate=0.0001 \
+  --dataset_path="$PIANOROLL_DIR/jsb.pkl" \
+  --dataset_type="pianoroll"
+```
+
+You should see output that looks something like this (with a lot of extra logging cruft):
+
+```
+Step 1, fivo bound per timestep: -11.801050
+global_step/sec: 9.89825
+Step 101, fivo bound per timestep: -11.198309
+global_step/sec: 9.55475
+Step 201, fivo bound per timestep: -11.287262
+global_step/sec: 9.68146
+Step 301, fivo bound per timestep: -11.316490
+global_step/sec: 9.94295
+Step 401, fivo bound per timestep: -11.151743
+```
+You will also see lines saying `Out of range: exceptions.StopIteration: Iteration finished`. This is not an error and is fine.
+
+#### Evaluation
+
+You can also evaluate saved checkpoints. The `eval` mode loads a model checkpoint, tests its performance on all items in a dataset, and reports the log-likelihood averaged over the dataset. For example, here is a command, taken from `bin/run_eval.sh`, that will evaluate a JSB model on the test set:
+
+```
+python fivo.py \
+  --mode=eval \
+  --split=test \
+  --alsologtostderr \
+  --logdir=/tmp/fivo \
+  --model=vrnn \
+  --batch_size=4 \
+  --num_samples=4 \
+  --dataset_path="$PIANOROLL_DIR/jsb.pkl" \
+  --dataset_type="pianoroll"
+```
+
+You should see output like this:
+```
+Model restored from step 1, evaluating.
+test elbo ll/t: -12.299635, iwae ll/t: -12.128336 fivo ll/t: -11.656939
+test elbo ll/seq: -754.750312, iwae ll/seq: -744.238773 fivo ll/seq: -715.3121490
+```
+The evaluation script prints log-likelihood in both nats per timestep (ll/t) and nats per sequence (ll/seq) for all three bounds.
+
+### Training on TIMIT
+
+The TIMIT speech dataset is available at the [Linguistic Data Consortium website](https://catalog.ldc.upenn.edu/LDC93S1), but is unfortunately not free. These instructions will proceed assuming you have downloaded the TIMIT archive and extracted it into the directory `$RAW_TIMIT_DIR`.
+
+#### Preprocess TIMIT
+
+We preprocess TIMIT (as described in our paper) and write it out to a series of TFRecord files. To prepare the TIMIT dataset, use the script `create_timit_dataset.py`:
+```
+export TIMIT_DIR=~/timit_dataset
+mkdir $TIMIT_DIR
+python data/create_timit_dataset.py \
+  --raw_timit_dir=$RAW_TIMIT_DIR \
+  --out_dir=$TIMIT_DIR
+```
+You should see this exact output:
+```
+4389 train / 231 valid / 1680 test
+train mean: 0.006060  train std: 548.136169
+```
+
+#### Training on TIMIT
+This is very similar to training on pianoroll datasets, with just a few flags switched.
+```
+python fivo.py \
+  --mode=train \
+  --logdir=/tmp/fivo \
+  --model=vrnn \
+  --bound=fivo \
+  --summarize_every=100 \
+  --batch_size=4 \
+  --num_samples=4 \
+  --learning_rate=0.0001 \
+  --dataset_path="$TIMIT_DIR/train" \
+  --dataset_type="speech"
+```
+
+### Contact
+
+This codebase is maintained by Dieterich Lawson, reachable via email at dieterichl@google.com. For questions and issues please open an issue on the tensorflow/models issues tracker and assign it to @dieterichlawson.
--- a/research/fivo/bin/download_pianorolls.sh
+++ b/research/fivo/bin/download_pianorolls.sh
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# A script to download the pianoroll datasets.
+# Accepts one argument, the directory to put the files in
+
+# Require the destination directory as the first positional argument.
+if [ -z "$1" ]
+  then
+    # Report the usage error on stderr and exit with a non-zero status so
+    # that callers (and `set -e` scripts) can detect the failure.
+    echo "Error, must provide a directory to download the files to." >&2
+    exit 1
+fi
+
+echo "Downloading datasets into $1"
+# The destination paths are quoted so directories containing spaces work.
+curl -s "http://www-etud.iro.umontreal.ca/~boulanni/Piano-midi.de.pickle" > "$1/piano-midi.de.pkl"
+curl -s "http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.pickle" > "$1/nottingham.pkl"
+curl -s "http://www-etud.iro.umontreal.ca/~boulanni/MuseData.pickle" > "$1/musedata.pkl"
+curl -s "http://www-etud.iro.umontreal.ca/~boulanni/JSB%20Chorales.pickle" > "$1/jsb.pkl"
--- a/research/fivo/bin/run_eval.sh
+++ b/research/fivo/bin/run_eval.sh
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# An example of running evaluation.
+
+PIANOROLL_DIR=$HOME/pianorolls
+
+python fivo.py \
+  --mode=eval \
+  --logdir=/tmp/fivo \
+  --model=vrnn \
+  --batch_size=4 \
+  --num_samples=4 \
+  --split=test \
+  --dataset_path="$PIANOROLL_DIR/jsb.pkl" \
+  --dataset_type="pianoroll"
--- a/research/fivo/bin/run_train.sh
+++ b/research/fivo/bin/run_train.sh
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# An example of running training.
+
+PIANOROLL_DIR=$HOME/pianorolls
+
+python fivo.py \
+  --mode=train \
+  --logdir=/tmp/fivo \
+  --model=vrnn \
+  --bound=fivo \
+  --summarize_every=100 \
+  --batch_size=4 \
+  --num_samples=4 \
+  --learning_rate=0.0001 \
+  --dataset_path="$PIANOROLL_DIR/jsb.pkl" \
+  --dataset_type="pianoroll"
--- a/research/fivo/bounds.py
+++ b/research/fivo/bounds.py
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Implementation of objectives for training stochastic latent variable models.
+
+Contains implementations of the Importance Weighted Autoencoder objective (IWAE)
+and the Filtering Variational objective (FIVO).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import nested_utils as nested
+
+
+def iwae(cell,
+         inputs,
+         seq_lengths,
+         num_samples=1,
+         parallel_iterations=30,
+         swap_memory=True):
+  """Computes the IWAE lower bound on the log marginal probability.
+
+  This method accepts a stochastic latent variable model and some observations
+  and computes a stochastic lower bound on the log marginal probability of the
+  observations. The IWAE estimator is defined by averaging multiple importance
+  weights. For more details see "Importance Weighted Autoencoders" by Burda
+  et al. https://arxiv.org/abs/1509.00519.
+
+  When num_samples = 1, this bound becomes the evidence lower bound (ELBO).
+
+  Args:
+    cell: A callable that implements one timestep of the model. See
+      models/vrnn.py for an example. It must also provide a zero_state method,
+      which is used to build the initial state.
+    inputs: The inputs to the model. A potentially nested list or tuple of
+      Tensors each of shape [max_seq_len, batch_size, ...]. The Tensors must
+      have a rank at least two and have matching shapes in the first two
+      dimensions, which represent time and the batch respectively. At each
+      timestep 'cell' will be called with a slice of the Tensors in inputs.
+    seq_lengths: A [batch_size] Tensor of ints encoding the length of each
+      sequence in the batch (sequences can be padded to a common length).
+    num_samples: The number of samples to use.
+    parallel_iterations: The number of parallel iterations to use for the
+      internal while loop.
+    swap_memory: Whether GPU-CPU memory swapping should be enabled for the
+      internal while loop.
+
+  Returns:
+    log_p_hat: A Tensor of shape [batch_size] containing IWAE's estimate of the
+      log marginal probability of the observations.
+    kl: A Tensor of shape [batch_size] containing the sum over time of the kl
+      divergence from q(z|x) to p(z), averaged over samples.
+    log_weights: A Tensor of shape [max_seq_len, batch_size, num_samples]
+      containing the accumulated log weights at each timestep. Will not be
+      valid for timesteps past the end of a sequence.
+    log_ess: A Tensor of shape [max_seq_len, batch_size] containing the log
+      effective sample size at each timestep. Will not be valid for timesteps
+      past the end of a sequence.
+  """
+  batch_size = tf.shape(seq_lengths)[0]
+  max_seq_len = tf.reduce_max(seq_lengths)
+  # Time-major mask of shape [max_seq_len, batch_size]: 1.0 inside each
+  # sequence and 0.0 on padded timesteps.
+  seq_mask = tf.transpose(
+      tf.sequence_mask(seq_lengths, maxlen=max_seq_len, dtype=tf.float32),
+      perm=[1, 0])
+  # NOTE(review): tile_tensors presumably repeats the batch dimension
+  # num_samples times so that each sample gets its own copy of the inputs and
+  # mask -- confirm against nested_utils.
+  if num_samples > 1:
+    inputs, seq_mask = nested.tile_tensors([inputs, seq_mask], [1, num_samples])
+  inputs_ta, mask_ta = nested.tas_for_tensors([inputs, seq_mask], max_seq_len)
+
+  t0 = tf.constant(0, tf.int32)
+  init_states = cell.zero_state(batch_size * num_samples, tf.float32)
+  # One TensorArray per per-timestep output that is returned to the caller.
+  ta_names = ['log_weights', 'log_ess']
+  tas = [tf.TensorArray(tf.float32, max_seq_len, name='%s_ta' % n)
+         for n in ta_names]
+  # Running sums carried through the loop: the accumulated log weights are
+  # kept as [num_samples, batch_size], the kl is kept flat.
+  log_weights_acc = tf.zeros([num_samples, batch_size], dtype=tf.float32)
+  kl_acc = tf.zeros([num_samples * batch_size], dtype=tf.float32)
+  accs = (log_weights_acc, kl_acc)
+
+  def while_predicate(t, *unused_args):
+    """Continues the loop until every timestep has been processed."""
+    return t < max_seq_len
+
+  def while_step(t, rnn_state, tas, accs):
+    """Implements one timestep of IWAE computation."""
+    log_weights_acc, kl_acc = accs
+    cur_inputs, cur_mask = nested.read_tas([inputs_ta, mask_ta], t)
+    # Run the cell for one step.
+    log_q_z, log_p_z, log_p_x_given_z, kl, new_state = cell(
+        cur_inputs,
+        rnn_state,
+        cur_mask,
+    )
+    # Compute the incremental weight and use it to update the current
+    # accumulated weight. Multiplying by the mask zeroes out contributions
+    # from timesteps past the end of a sequence.
+    kl_acc += kl * cur_mask
+    log_alpha = (log_p_x_given_z + log_p_z - log_q_z) * cur_mask
+    log_alpha = tf.reshape(log_alpha, [num_samples, batch_size])
+    log_weights_acc += log_alpha
+    # Calculate the effective sample size in log space:
+    # log ESS = 2 * log(sum_i w_i) - log(sum_i w_i^2).
+    ess_num = 2 * tf.reduce_logsumexp(log_weights_acc, axis=0)
+    ess_denom = tf.reduce_logsumexp(2 * log_weights_acc, axis=0)
+    log_ess = ess_num - ess_denom
+    # Update the TensorArrays and accumulators.
+    ta_updates = [log_weights_acc, log_ess]
+    new_tas = [ta.write(t, x) for ta, x in zip(tas, ta_updates)]
+    new_accs = (log_weights_acc, kl_acc)
+    return t + 1, new_state, new_tas, new_accs
+
+  _, _, tas, accs = tf.while_loop(
+      while_predicate,
+      while_step,
+      loop_vars=(t0, init_states, tas, accs),
+      parallel_iterations=parallel_iterations,
+      swap_memory=swap_memory)
+
+  log_weights, log_ess = [x.stack() for x in tas]
+  final_log_weights, kl = accs
+  # Average the importance weights in log space:
+  # log((1 / num_samples) * sum_i w_i).
+  log_p_hat = (tf.reduce_logsumexp(final_log_weights, axis=0) -
+               tf.log(tf.to_float(num_samples)))
+  kl = tf.reduce_mean(tf.reshape(kl, [num_samples, batch_size]), axis=0)
+  # Stacked log weights are [max_seq_len, num_samples, batch_size]; swap the
+  # last two axes to return [max_seq_len, batch_size, num_samples].
+  log_weights = tf.transpose(log_weights, perm=[0, 2, 1])
+  return log_p_hat, kl, log_weights, log_ess
+
+
+def ess_criterion(num_samples, log_ess, unused_t):
+  """A criterion that resamples based on effective sample size."""
+  return log_ess <= tf.log(num_samples / 2.0)
+
+
+def never_resample_criterion(unused_num_samples, log_ess, unused_t):
+  """A criterion that never resamples."""
+  return tf.cast(tf.zeros_like(log_ess), tf.bool)
+
+
+def always_resample_criterion(unused_num_samples, log_ess, unused_t):
+  """A criterion resamples at every timestep."""
+  return tf.cast(tf.ones_like(log_ess), tf.bool)
+
+
+def fivo(cell,
+         inputs,
+         seq_lengths,
+         num_samples=1,
+         resampling_criterion=ess_criterion,
+         parallel_iterations=30,
+         swap_memory=True,
+         random_seed=None):
+  """Computes the FIVO lower bound on the log marginal probability.
+
+  This method accepts a stochastic latent variable model and some observations
+  and computes a stochastic lower bound on the log marginal probability of the
+  observations. The lower bound is defined by a particle filter's unbiased
+  estimate of the marginal probability of the observations. For more details see
+  "Filtering Variational Objectives" by Maddison et al.
+  https://arxiv.org/abs/1705.09279.
+
+  When the resampling criterion is "never resample", this bound becomes IWAE.
+
+  Args:
+    cell: A callable that implements one timestep of the model. See
+      models/vrnn.py for an example.
+    inputs: The inputs to the model. A potentially nested list or tuple of
+      Tensors each of shape [max_seq_len, batch_size, ...]. The Tensors must
+      have a rank at least two and have matching shapes in the first two
+      dimensions, which represent time and the batch respectively. At each
+      timestep 'cell' will be called with a slice of the Tensors in inputs.
+    seq_lengths: A [batch_size] Tensor of ints encoding the length of each
+      sequence in the batch (sequences can be padded to a common length).
+    num_samples: The number of particles to use in each particle filter.
+    resampling_criterion: The resampling criterion to use for this particle
+      filter. Must accept the number of samples, the effective sample size,
+      and the current timestep and return a boolean Tensor of shape [batch_size]
+      indicating whether each particle filter should resample. See
+      ess_criterion and related functions defined in this file for examples.
+    parallel_iterations: The number of parallel iterations to use for the
+      internal while loop. Note that values greater than 1 can introduce
+      non-determinism even when random_seed is provided.
+    swap_memory: Whether GPU-CPU memory swapping should be enabled for the
+      internal while loop.
+    random_seed: The random seed to pass to the resampling operations in
+      the particle filter. Mainly useful for testing.
+
+  Returns:
+    log_p_hat: A Tensor of shape [batch_size] containing FIVO's estimate of the
+      log marginal probability of the observations.
+    kl: A Tensor of shape [batch_size] containing the sum over time of the kl
+      divergence from q_t(z_t|x) to p_t(z_t), averaged over particles. Note that
+      this includes kl terms from trajectories that are culled during resampling
+      steps.
+    log_weights: A Tensor of shape [max_seq_len, batch_size, num_samples]
+      containing the log weights at each timestep of the particle filter. Note
+      that on timesteps when a resampling operation is performed the log weights
+      are reset to 0. Will not be valid for timesteps past the end of a
+      sequence.
+    log_ess: A Tensor of shape [max_seq_len, batch_size] containing the log
+      effective sample size of each particle filter at each timestep. Will not
+      be valid for timesteps past the end of a sequence.
+    resampled: A Tensor of shape [max_seq_len, batch_size] indicating when the
+      particle filters resampled. Will be 1.0 on timesteps when resampling
+      occurred and 0.0 on timesteps when it did not.
+  """
+  # batch_size represents the number of particle filters running in parallel.
+  batch_size = tf.shape(seq_lengths)[0]
+  max_seq_len = tf.reduce_max(seq_lengths)
+  seq_mask = tf.transpose(
+      tf.sequence_mask(seq_lengths, maxlen=max_seq_len, dtype=tf.float32),
+      perm=[1, 0])
+
+  # Each sequence in the batch will be the input data for a different
+  # particle filter. The batch will be laid out as:
+  #   particle 1 of particle filter 1
+  #   particle 1 of particle filter 2
+  #   ...
+  #   particle 1 of particle filter batch_size
+  #   particle 2 of particle filter 1
+  #   ...
+  #   particle num_samples of particle filter batch_size
+  if num_samples > 1:
+    inputs, seq_mask = nested.tile_tensors([inputs, seq_mask], [1, num_samples])
+  inputs_ta, mask_ta = nested.tas_for_tensors([inputs, seq_mask], max_seq_len)
+
+  t0 = tf.constant(0, tf.int32)
+  init_states = cell.zero_state(batch_size * num_samples, tf.float32)
+  ta_names = ['log_weights', 'log_ess', 'resampled']
+  tas = [tf.TensorArray(tf.float32, max_seq_len, name='%s_ta' % n)
+         for n in ta_names]
+  log_weights_acc = tf.zeros([num_samples, batch_size], dtype=tf.float32)
+  log_p_hat_acc = tf.zeros([batch_size], dtype=tf.float32)
+  kl_acc = tf.zeros([num_samples * batch_size], dtype=tf.float32)
+  accs = (log_weights_acc, log_p_hat_acc, kl_acc)
+
+  def while_predicate(t, *unused_args):
+    return t < max_seq_len
+
+  def while_step(t, rnn_state, tas, accs):
+    """Implements one timestep of FIVO computation."""
+    log_weights_acc, log_p_hat_acc, kl_acc = accs
+    cur_inputs, cur_mask = nested.read_tas([inputs_ta, mask_ta], t)
+    # Run the cell for one step.
+    log_q_z, log_p_z, log_p_x_given_z, kl, new_state = cell(
+        cur_inputs,
+        rnn_state,
+        cur_mask,
+    )
+    # Compute the incremental weight and use it to update the current
+    # accumulated weight.
+    kl_acc += kl * cur_mask
+    log_alpha = (log_p_x_given_z + log_p_z - log_q_z) * cur_mask
+    log_alpha = tf.reshape(log_alpha, [num_samples, batch_size])
+    log_weights_acc += log_alpha
+    # Calculate the effective sample size.
+    ess_num = 2 * tf.reduce_logsumexp(log_weights_acc, axis=0)
+    ess_denom = tf.reduce_logsumexp(2 * log_weights_acc, axis=0)
+    log_ess = ess_num - ess_denom
+    # Calculate the ancestor indices via resampling. Because we maintain the
+    # log unnormalized weights, we pass the weights in as logits, allowing
+    # the distribution object to apply a softmax and normalize them.
+    resampling_dist = tf.contrib.distributions.Categorical(
+        logits=tf.transpose(log_weights_acc, perm=[1, 0]))
+    ancestor_inds = tf.stop_gradient(
+        resampling_dist.sample(sample_shape=num_samples, seed=random_seed))
+    # Because the batch is flattened and laid out as discussed
+    # above, we must modify ancestor_inds to index the proper samples.
+    # The particles in the ith filter are distributed every batch_size rows
+    # in the batch, and offset i rows from the top. So, to correct the indices
+    # we multiply by the batch_size and add the proper offset. Crucially,
+    # when ancestor_inds is flattened the layout of the batch is maintained.
+    offset = tf.expand_dims(tf.range(batch_size), 0)
+    ancestor_inds = tf.reshape(ancestor_inds * batch_size + offset, [-1])
+    noresample_inds = tf.range(num_samples * batch_size)
+    # Decide whether or not we should resample; don't resample if we are past
+    # the end of a sequence.
+    should_resample = resampling_criterion(num_samples, log_ess, t)
+    should_resample = tf.logical_and(should_resample,
+                                     cur_mask[:batch_size] > 0.)
+    float_should_resample = tf.to_float(should_resample)
+    ancestor_inds = tf.where(
+        tf.tile(should_resample, [num_samples]),
+        ancestor_inds,
+        noresample_inds)
+    new_state = nested.gather_tensors(new_state, ancestor_inds)
+    # Update the TensorArrays before we reset the weights so that we capture
+    # the incremental weights and not zeros.
+    ta_updates = [log_weights_acc, log_ess, float_should_resample]
+    new_tas = [ta.write(t, x) for ta, x in zip(tas, ta_updates)]
+    # For the particle filters that resampled, update log_p_hat and
+    # reset weights to zero.
+    log_p_hat_update = tf.reduce_logsumexp(
+        log_weights_acc, axis=0) - tf.log(tf.to_float(num_samples))
+    log_p_hat_acc += log_p_hat_update * float_should_resample
+    log_weights_acc *= (1. - tf.tile(float_should_resample[tf.newaxis, :],
+                                     [num_samples, 1]))
+    new_accs = (log_weights_acc, log_p_hat_acc, kl_acc)
+    return t + 1, new_state, new_tas, new_accs
+
+  _, _, tas, accs = tf.while_loop(
+      while_predicate,
+      while_step,
+      loop_vars=(t0, init_states, tas, accs),
+      parallel_iterations=parallel_iterations,
+      swap_memory=swap_memory)
+
+  log_weights, log_ess, resampled = [x.stack() for x in tas]
+  final_log_weights, log_p_hat, kl = accs
+  # Add in the final weight update to log_p_hat.
+  log_p_hat += (tf.reduce_logsumexp(final_log_weights, axis=0) -
+                tf.log(tf.to_float(num_samples)))
+  kl = tf.reduce_mean(tf.reshape(kl, [num_samples, batch_size]), axis=0)
+  log_weights = tf.transpose(log_weights, perm=[0, 2, 1])
+  return log_p_hat, kl, log_weights, log_ess, resampled
--- a/research/fivo/data/__init__.py
+++ b/research/fivo/data/__init__.py
--- a/research/fivo/data/calculate_pianoroll_mean.py
+++ b/research/fivo/data/calculate_pianoroll_mean.py
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Script to calculate the mean of a pianoroll dataset.
+
+Given a pianoroll pickle file, this script loads the dataset and
+calculates the mean of the training set. Then it updates the pickle file
+so that the key "train_mean" points to the mean vector.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+import numpy as np
+
+import tensorflow as tf
+
+
+from datasets import sparse_pianoroll_to_dense
+
+# Command-line flags. The input pickle is mandatory; the output path is
+# optional and main() falls back to in_file when it is left unset.
+tf.app.flags.DEFINE_string('in_file', None,
+                           'Filename of the pickled pianoroll dataset to load.')
+tf.app.flags.DEFINE_string('out_file', None,
+                           'Name of the output pickle file. Defaults to in_file, '
+                           'updating the input pickle file.')
+tf.app.flags.mark_flag_as_required('in_file')
+
+FLAGS = tf.app.flags.FLAGS
+
+# Note range handed to sparse_pianoroll_to_dense in main(). Both bounds are
+# inclusive, hence the + 1 when computing the number of distinct notes.
+MIN_NOTE = 21
+MAX_NOTE = 108
+NUM_NOTES = MAX_NOTE - MIN_NOTE + 1
+
+
+def main(unused_argv):
+  """Computes the training-set mean of a pianoroll pickle and stores it.
+
+  Loads the pickle named by --in_file, converts every pianoroll of its 'train'
+  split to a dense array, averages over all timesteps, and writes the whole
+  dataset to --out_file (or back to --in_file when --out_file is unset) with
+  the mean vector stored under the key 'train_mean'.
+
+  Args:
+    unused_argv: Positional arguments passed in by tf.app.run; ignored.
+  """
+  if FLAGS.out_file is None:
+    FLAGS.out_file = FLAGS.in_file
+  # Pickles are binary data: text mode ('r') breaks pickle.load on Python 3.
+  # NOTE: unpickling can execute arbitrary code; only load trusted files.
+  with tf.gfile.Open(FLAGS.in_file, 'rb') as f:
+    pianorolls = pickle.load(f)
+  dense_pianorolls = [sparse_pianoroll_to_dense(p, MIN_NOTE, NUM_NOTES)[0]
+                      for p in pianorolls['train']]
+  # Concatenate all elements along the time axis.
+  concatenated = np.concatenate(dense_pianorolls, axis=0)
+  mean = np.mean(concatenated, axis=0)
+  pianorolls['train_mean'] = mean
+  # Write out the whole pickle file, including the train mean. The context
+  # manager makes sure the data is flushed and the handle is closed.
+  with open(FLAGS.out_file, 'wb') as out_f:
+    pickle.dump(pianorolls, out_f)
+
+
+# Script entry point. tf.app.run() is expected to parse the flags defined
+# above and then call main().
+if __name__ == '__main__':
+  tf.app.run()
--- a/research/fivo/data/create_timit_dataset.py
+++ b/research/fivo/data/create_timit_dataset.py
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Preprocesses TIMIT from raw wavfiles to create a set of TFRecords.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import os
+import random
+import re
+
+import numpy as np
+import tensorflow as tf
+
+# Command-line flags. Both directories are required; valid_frac sets the share
+# of the TRAIN files that main() holds out as a validation set.
+tf.app.flags.DEFINE_string("raw_timit_dir", None,
+                          "Directory containing TIMIT files.")
+tf.app.flags.DEFINE_string("out_dir", None,
+                          "Output directory for TFRecord files.")
+tf.app.flags.DEFINE_float("valid_frac", 0.05,
+                          "Fraction of train set to use as valid set. "
+                          "Must be between 0.0 and 1.0.")
+
+tf.app.flags.mark_flag_as_required("raw_timit_dir")
+tf.app.flags.mark_flag_as_required("out_dir")
+
+FLAGS = tf.app.flags.FLAGS
+
+# Expected file counts; main() asserts that the loaded data matches them.
+NUM_TRAIN_FILES = 4620
+NUM_TEST_FILES = 1680
+# Row width main() passes to preprocess() as block_size.
+SAMPLES_PER_TIMESTEP = 200
+
+# Regexes for reading SPHERE header files.
+# Each one captures the integer value of a single "<field> -i <value>" entry.
+SAMPLE_COUNT_REGEX = re.compile(r"sample_count -i (\d+)")
+SAMPLE_MIN_REGEX = re.compile(r"sample_min -i (-?\d+)")
+SAMPLE_MAX_REGEX = re.compile(r"sample_max -i (-?\d+)")
+
+
+def get_filenames(split):
+  """Lists the wav files of one TIMIT split in sorted order.
+
+  Args:
+    split: Name of the split directory to search below <raw_timit_dir>/TIMIT.
+  Returns:
+    A sorted list of the paths matching
+    <raw_timit_dir>/TIMIT/<split>/*/*/*.WAV.
+  """
+  pattern = os.path.join(
+      FLAGS.raw_timit_dir, "TIMIT", split, "*", "*", "*.WAV")
+  # glob gives no ordering guarantee; sorting makes the result deterministic.
+  return sorted(glob.glob(pattern))
+
+
+def load_timit_wav(filename):
+  """Loads a TIMIT wavfile into a numpy array.
+
+  TIMIT wavfiles include a SPHERE header, detailed in the TIMIT docs. The first
+  line is the header type and the second is the length of the header in bytes.
+  After the header, the remaining bytes are actual WAV data.
+
+  The header includes information about the WAV data such as the number of
+  samples and minimum and maximum amplitude. This function asserts that the
+  loaded wav data matches the header.
+
+  Args:
+    filename: The name of the TIMIT wavfile to load.
+  Returns:
+    wav: A float32 numpy array containing the loaded wav data.
+  Raises:
+    AssertionError: If the sample count, minimum or maximum of the loaded data
+      disagrees with the header.
+  """
+  # The context manager closes the handle even when parsing fails part-way.
+  with open(filename, "rb") as wav_file:
+    header_type = wav_file.readline()
+    header_length_str = wav_file.readline()
+    # The header length includes the length of the first two lines.
+    header_remaining_bytes = (int(header_length_str) - len(header_type) -
+                              len(header_length_str))
+    # The file is read as bytes while the header regexes are text patterns, so
+    # decode before searching; matching bytes against them fails on Python 3.
+    header = wav_file.read(header_remaining_bytes).decode("latin-1")
+    raw_samples = wav_file.read()
+  # Read the relevant header fields.
+  sample_count = int(SAMPLE_COUNT_REGEX.search(header).group(1))
+  sample_min = int(SAMPLE_MIN_REGEX.search(header).group(1))
+  sample_max = int(SAMPLE_MAX_REGEX.search(header).group(1))
+  # frombuffer replaces the deprecated binary mode of fromstring; astype
+  # returns a fresh, writable float32 copy.
+  wav = np.frombuffer(raw_samples, dtype="int16").astype("float32")
+  # Check that the loaded data conforms to the header description.
+  assert len(wav) == sample_count
+  assert wav.min() == sample_min
+  assert wav.max() == sample_max
+  return wav
+
+
+def preprocess(wavs, block_size, mean, std):
+  """Standardizes each waveform and cuts it into fixed-size blocks.
+
+  Args:
+    wavs: An iterable of numpy arrays of audio samples.
+    block_size: The number of samples in each row of the output arrays.
+    mean: Value subtracted from every sample.
+    std: Value each mean-centered sample is divided by.
+  Returns:
+    A list holding one [num_blocks, block_size] array per input waveform.
+    Waveforms whose length is not a multiple of block_size are padded with
+    trailing zeros after normalization.
+  """
+  def to_blocks(samples):
+    normalized = (samples - mean) / std
+    remainder = normalized.shape[0] % block_size
+    if remainder:
+      # Append just enough zeros to reach the next multiple of block_size.
+      normalized = np.pad(normalized, (0, block_size - remainder), "constant")
+    assert normalized.shape[0] % block_size == 0
+    return normalized.reshape((-1, block_size))
+
+  return [to_blocks(wav) for wav in wavs]
+
+
+def create_tfrecord_from_wavs(wavs, output_file):
+  """Serializes processed waveforms into a TFRecord file.
+
+  Every waveform becomes one record holding its raw float32 bytes.
+
+  Args:
+    wavs: An iterable of numpy arrays; one record is written per array.
+    output_file: Path of the TFRecord file to write.
+  """
+  with tf.python_io.TFRecordWriter(output_file) as record_writer:
+    for samples in wavs:
+      serialized = samples.astype(np.float32).tobytes()
+      record_writer.write(serialized)
+
+
+def main(unused_argv):
+  """Converts the raw TIMIT wavfiles into train/valid/test TFRecord files.
+
+  A fixed-seed shuffle holds out valid_frac of the TRAIN files as a validation
+  set. All three splits are normalized with the mean and standard deviation of
+  the remaining training data, cut into rows of SAMPLES_PER_TIMESTEP samples,
+  and written to <out_dir>/train, <out_dir>/valid and <out_dir>/test.
+
+  Args:
+    unused_argv: Positional arguments passed in by tf.app.run; ignored.
+  Raises:
+    ValueError: If --valid_frac lies outside [0.0, 1.0].
+  """
+  # The flag help promises this range but nothing enforced it; fail early
+  # instead of producing a nonsensical split after loading every file.
+  if not 0.0 <= FLAGS.valid_frac <= 1.0:
+    raise ValueError(
+        "valid_frac must be between 0.0 and 1.0, got %r" % FLAGS.valid_frac)
+  # Create the output directory up front so that the writers at the end do not
+  # fail on a missing directory after all the loading and processing is done.
+  tf.gfile.MakeDirs(FLAGS.out_dir)
+
+  train_filenames = get_filenames("TRAIN")
+  test_filenames = get_filenames("TEST")
+
+  num_train_files = len(train_filenames)
+  num_test_files = len(test_filenames)
+  num_valid_files = int(num_train_files * FLAGS.valid_frac)
+  num_train_files -= num_valid_files
+
+  print("%d train / %d valid / %d test" % (
+      num_train_files, num_valid_files, num_test_files))
+
+  # Fixed seed so that the train/valid split is reproducible across runs.
+  random.seed(1234)
+  random.shuffle(train_filenames)
+
+  valid_filenames = train_filenames[:num_valid_files]
+  train_filenames = train_filenames[num_valid_files:]
+
+  # Make sure there is no overlap in the train, test, and valid sets.
+  train_s = set(train_filenames)
+  test_s = set(test_filenames)
+  valid_s = set(valid_filenames)
+  # Disable explicit length testing to make the assertions more readable.
+  # pylint: disable=g-explicit-length-test
+  assert len(train_s & test_s) == 0
+  assert len(train_s & valid_s) == 0
+  assert len(valid_s & test_s) == 0
+  # pylint: enable=g-explicit-length-test
+
+  train_wavs = [load_timit_wav(f) for f in train_filenames]
+  valid_wavs = [load_timit_wav(f) for f in valid_filenames]
+  test_wavs = [load_timit_wav(f) for f in test_filenames]
+  # Guard against an incomplete or mis-located TIMIT archive.
+  assert len(train_wavs) + len(valid_wavs) == NUM_TRAIN_FILES
+  assert len(test_wavs) == NUM_TEST_FILES
+
+  # Calculate the mean and standard deviation of the train set.
+  train_stacked = np.hstack(train_wavs)
+  train_mean = np.mean(train_stacked)
+  train_std = np.std(train_stacked)
+  print("train mean: %f  train std: %f" % (train_mean, train_std))
+
+  # Process all data, normalizing with the train set statistics.
+  processed_train_wavs = preprocess(train_wavs, SAMPLES_PER_TIMESTEP,
+                                    train_mean, train_std)
+  processed_valid_wavs = preprocess(valid_wavs, SAMPLES_PER_TIMESTEP,
+                                    train_mean, train_std)
+  processed_test_wavs = preprocess(test_wavs, SAMPLES_PER_TIMESTEP, train_mean,
+                                   train_std)
+
+  # Write the datasets to disk.
+  create_tfrecord_from_wavs(
+      processed_train_wavs,
+      os.path.join(FLAGS.out_dir, "train"))
+  create_tfrecord_from_wavs(
+      processed_valid_wavs,
+      os.path.join(FLAGS.out_dir, "valid"))
+  create_tfrecord_from_wavs(
+      processed_test_wavs,
+      os.path.join(FLAGS.out_dir, "test"))
+
+
+# Script entry point. tf.app.run() is expected to parse the flags defined
+# above and then call main().
+if __name__ == "__main__":
+  tf.app.run()
--- a/research/fivo/data/datasets.py
+++ b/research/fivo/data/datasets.py
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Code for creating sequence datasets.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+from scipy.sparse import coo_matrix
+import tensorflow as tf
+
+# The default number of threads used to process data in parallel. It is the
+# default value of num_parallel_calls for the dataset constructors below.
+DEFAULT_PARALLELISM = 12
+
+
+def sparse_pianoroll_to_dense(pianoroll, min_note, num_notes):
+  """Converts a sparse pianoroll to a dense numpy array.
+
+  Given a sparse pianoroll, converts it to a dense numpy array of shape
+  [num_timesteps, num_notes] where entry i,j is 1.0 if note j is active on
+  timestep i and 0.0 otherwise. A note listed more than once in the same
+  timestep still produces 1.0.
+
+  Args:
+    pianoroll: A sparse pianoroll object, a list of tuples where the i'th tuple
+      contains the indices of the notes active at timestep i.
+    min_note: The minimum note in the pianoroll, subtracted from all notes so
+      that the minimum note becomes 0.
+    num_notes: The number of possible different note indices, determines the
+      second dimension of the resulting dense array.
+  Returns:
+    dense_pianoroll: A [num_timesteps, num_notes] numpy array of floats.
+    num_timesteps: A python int, the number of timesteps in the pianoroll.
+  """
+  num_timesteps = len(pianoroll)
+  rows = []
+  cols = []
+  for time, chord in enumerate(pianoroll):
+    # coo_matrix sums entries that share coordinates, so a note repeated
+    # within a chord would come out as 2.0; de-duplicate the chord first.
+    for note in set(chord):
+      rows.append(time)
+      # Re-index the notes to start from min_note.
+      cols.append(note - min_note)
+  shape = [num_timesteps, num_notes]
+  values = [1.] * len(rows)
+  sparse_pianoroll = coo_matrix((values, (rows, cols)), shape=shape)
+  return sparse_pianoroll.toarray(), num_timesteps
+
+
+def create_pianoroll_dataset(path,
+                             split,
+                             batch_size,
+                             num_parallel_calls=DEFAULT_PARALLELISM,
+                             shuffle=False,
+                             repeat=False,
+                             min_note=21,
+                             max_note=108):
+  """Creates a pianoroll dataset.
+
+  Args:
+    path: The path of a pickle file containing the dataset to load.
+    split: The split to use, can be train, test, or valid.
+    batch_size: The batch size. If repeat is False then it is not guaranteed
+      that the true batch size will match for all batches since batch_size
+      may not necessarily evenly divide the number of elements.
+    num_parallel_calls: The number of threads to use for parallel processing of
+      the data.
+    shuffle: If true, shuffles the order of the dataset.
+    repeat: If true, repeats the dataset endlessly.
+    min_note: The minimum note number of the dataset. For all pianoroll datasets
+      the minimum note is number 21, and changing this affects the dimension of
+      the data. This is useful mostly for testing.
+    max_note: The maximum note number of the dataset. For all pianoroll datasets
+      the maximum note is number 108, and changing this affects the dimension of
+      the data. This is useful mostly for testing.
+  Returns:
+    inputs: A batch of input sequences represented as a dense Tensor of shape
+      [time, batch_size, data_dimension]. The sequences in inputs are the
+      sequences in targets shifted one timestep into the future, padded with
+      zeros. This tensor is mean-centered, with the mean taken from the pickle
+      file key 'train_mean'.
+    targets: A batch of target sequences represented as a dense Tensor of
+      shape [time, batch_size, data_dimension].
+    lens: An int Tensor of shape [batch_size] representing the lengths of each
+      sequence in the batch.
+    mean: A float Tensor of shape [data_dimension] containing the mean loaded
+      from the pickle file.
+  """
+  # Load the data from disk.
+  num_notes = max_note - min_note + 1
+  # Pickles are binary data: text mode ("r") breaks pickle.load on Python 3.
+  # NOTE: unpickling can execute arbitrary code; only load trusted files.
+  with tf.gfile.Open(path, "rb") as f:
+    raw_data = pickle.load(f)
+  pianorolls = raw_data[split]
+  mean = raw_data["train_mean"]
+  num_examples = len(pianorolls)
+
+  def pianoroll_generator():
+    """Yields a (dense pianoroll, num_timesteps) pair for every example."""
+    for sparse_pianoroll in pianorolls:
+      yield sparse_pianoroll_to_dense(sparse_pianoroll, min_note, num_notes)
+
+  dataset = tf.data.Dataset.from_generator(
+      pianoroll_generator,
+      output_types=(tf.float64, tf.int64),
+      output_shapes=([None, num_notes], []))
+
+  if repeat: dataset = dataset.repeat()
+  if shuffle: dataset = dataset.shuffle(num_examples)
+
+  # Batch sequences together, padding them to a common length in time.
+  dataset = dataset.padded_batch(batch_size,
+                                 padded_shapes=([None, num_notes], []))
+
+  def process_pianoroll_batch(data, lengths):
+    """Create mean-centered and time-major next-step prediction Tensors."""
+    # Go from batch-major to time-major: [time, batch, num_notes].
+    data = tf.to_float(tf.transpose(data, perm=[1, 0, 2]))
+    lengths = tf.to_int32(lengths)
+    targets = data
+    # Mean center the inputs.
+    inputs = data - tf.constant(mean, dtype=tf.float32,
+                                shape=[1, 1, mean.shape[0]])
+    # Shift the inputs one step forward in time. Also remove the last timestep
+    # so that targets and inputs are the same length.
+    inputs = tf.pad(inputs, [[1, 0], [0, 0], [0, 0]], mode="CONSTANT")[:-1]
+    # Mask out unused timesteps.
+    inputs *= tf.expand_dims(tf.transpose(
+        tf.sequence_mask(lengths, dtype=inputs.dtype)), 2)
+    return inputs, targets, lengths
+
+  dataset = dataset.map(process_pianoroll_batch,
+                        num_parallel_calls=num_parallel_calls)
+  dataset = dataset.prefetch(num_examples)
+
+  itr = dataset.make_one_shot_iterator()
+  inputs, targets, lengths = itr.get_next()
+  return inputs, targets, lengths, tf.constant(mean, dtype=tf.float32)
+
+
+def create_speech_dataset(path,
+                          batch_size,
+                          samples_per_timestep=200,
+                          num_parallel_calls=DEFAULT_PARALLELISM,
+                          prefetch_buffer_size=2048,
+                          shuffle=False,
+                          repeat=False):
+  """Builds next-step prediction batches from a speech TFRecord file.
+
+  Args:
+    path: The path of a possibly sharded TFRecord file containing the data.
+    batch_size: The batch size. If repeat is False the final batch may be
+      smaller, because batch_size need not evenly divide the number of
+      elements.
+    samples_per_timestep: The number of audio samples per timestep, used to
+      reshape each record into a [time, samples_per_timestep] sequence. All
+      speech datasets use 200; change it only for testing.
+    num_parallel_calls: The number of threads to use for parallel processing of
+      the data.
+    prefetch_buffer_size: The size of the prefetch queues used after reading
+      and after processing the data; also the shuffle buffer size.
+    shuffle: If true, shuffles the order of the dataset.
+    repeat: If true, repeats the dataset endlessly.
+  Returns:
+    inputs: A dense Tensor of shape [time, batch_size, samples_per_timestep]
+      holding the targets shifted one timestep into the future, padded with
+      zeros.
+    targets: A dense Tensor of shape [time, batch_size, samples_per_timestep]
+      holding the target sequences.
+    lens: An int Tensor of shape [batch_size] with the length of each sequence
+      in the batch.
+  """
+  def decode_record(serialized):
+    """Decodes one raw float32 record into a sequence and its length."""
+    frames = tf.reshape(tf.decode_raw(serialized, out_type=tf.float32),
+                        [-1, samples_per_timestep])
+    return frames, tf.shape(frames)[0]
+
+  def make_next_step_batch(batch, seq_lens):
+    """Builds time-major inputs and targets for next-step prediction."""
+    targets = tf.transpose(batch, perm=[1, 0, 2])
+    seq_lens = tf.to_int32(seq_lens)
+    # Prepend a zero timestep and drop the last one, so the input at step t is
+    # the target from step t - 1 and both tensors have the same length.
+    shifted = tf.pad(targets, [[1, 0], [0, 0], [0, 0]], mode="CONSTANT")[:-1]
+    # Zero the timesteps beyond the end of each sequence.
+    time_major_mask = tf.transpose(
+        tf.sequence_mask(seq_lens, dtype=shifted.dtype))
+    inputs = shifted * tf.expand_dims(time_major_mask, 2)
+    return inputs, targets, seq_lens
+
+  # Read and decode the individual records.
+  dataset = tf.data.TFRecordDataset([path])
+  dataset = dataset.map(decode_record, num_parallel_calls=num_parallel_calls)
+  dataset = dataset.prefetch(prefetch_buffer_size)
+
+  if repeat:
+    dataset = dataset.repeat()
+  if shuffle:
+    dataset = dataset.shuffle(prefetch_buffer_size)
+
+  # Pad the sequences of a batch to a common length in time.
+  dataset = dataset.padded_batch(
+      batch_size, padded_shapes=([None, samples_per_timestep], []))
+  dataset = dataset.map(make_next_step_batch,
+                        num_parallel_calls=num_parallel_calls)
+  dataset = dataset.prefetch(prefetch_buffer_size)
+
+  batch_iterator = dataset.make_one_shot_iterator()
+  inputs, targets, lengths = batch_iterator.get_next()
+  return inputs, targets, lengths