ModelZoo / ResNet50_tensorflow / Commits / a315e568

Commit a315e568, authored Jan 25, 2017 by Lukasz Kaiser
Parent: d66941ac

    Update to the Neural GPU.

Showing 6 changed files with 2615 additions and 607 deletions (+2615 / -607).
Changed files:

  neural_gpu/README.md              +21   -12
  neural_gpu/data_utils.py          +193  -52
  neural_gpu/neural_gpu.py          +637  -217
  neural_gpu/neural_gpu_trainer.py  +889  -326
  neural_gpu/program_utils.py       +440  -0
  neural_gpu/wmt_utils.py           +435  -0
neural_gpu/README.md

@@ -4,7 +4,6 @@ in [[http://arxiv.org/abs/1511.08228]].
 Requirements:
 * TensorFlow (see tensorflow.org for how to install)
-* Matplotlib for Python (sudo apt-get install python-matplotlib)

 The model can be trained on the following algorithmic tasks:

@@ -26,17 +25,27 @@ The model can be trained on the following algorithmic tasks:
 * `qadd` - Long quaternary addition
 * `search` - Search for symbol key in dictionary

-The value range for symbols are defined by the `niclass` and `noclass` flags.
-In particular, the values are in the range `min(--niclass, noclass) - 1`.
-So if you set `--niclass=33` and `--noclass=33` (the default) then `--task=rev`
-will be reversing lists of 32 symbols, and `--task=id` will be identity on a
-list of up to 32 symbols.
+It can also be trained on the WMT English-French translation task:
+
+* `wmt` - WMT English-French translation (data will be downloaded)
+
+The value range for symbols are defined by the `vocab_size` flag.
+In particular, the values are in the range `vocab_size - 1`.
+So if you set `--vocab_size=16` (the default) then `--problem=rev`
+will be reversing lists of 15 symbols, and `--problem=id` will be identity
+on a list of up to 15 symbols.

-To train the model on the reverse task run:
+To train the model on the binary multiplication task run:

 ```
-python neural_gpu_trainer.py --task=rev
+python neural_gpu_trainer.py --problem=bmul
+```
+
+This trains the Extended Neural GPU, to train the original model run:
+
+```
+python neural_gpu_trainer.py --problem=bmul --beam_size=0
 ```

 While training, interim / checkpoint model parameters will be

@@ -47,16 +56,16 @@ with, hit `Ctrl-C` to stop the training process. The latest
 model parameters will be in `/tmp/neural_gpu/neural_gpu.ckpt-<step>`
 and used on any subsequent run.

-To test a trained model on how well it decodes run:
+To evaluate a trained model on how well it decodes run:

 ```
-python neural_gpu_trainer.py --task=rev --mode=1
+python neural_gpu_trainer.py --problem=bmul --mode=1
 ```

-To produce an animation of the result run:
+To interact with a model (experimental, see code) run:
 ```
-python neural_gpu_trainer.py --task=rev --mode=1 --animate=True
+python neural_gpu_trainer.py --problem=bmul --mode=2
 ```

 Maintained by Lukasz Kaiser (lukaszkaiser)
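For orientation, here is a toy illustration (not part of the commit) of what a `rev` example looks like under the new `--vocab_size=16` default, assuming symbols range over 1..15 with 0 reserved for padding (as the flag descriptions in the trainer suggest):

```python
# Illustrative sketch only, not code from the repository.
# With --vocab_size=16, symbols take values 1..15 and 0 is padding,
# so --problem=rev reverses lists drawn from 15 distinct symbols.
vocab_size = 16
inp = [3, 14, 7, 1, 9]          # symbols in range(1, vocab_size)
target = list(reversed(inp))    # what the model is trained to produce
assert all(0 < s < vocab_size for s in inp)
print(inp, "->", target)        # [3, 14, 7, 1, 9] -> [9, 1, 7, 14, 3]
```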
neural_gpu/data_utils.py

@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Convolutional Gated Recurrent Networks for Algorithm Learning."""
+"""Neural GPU -- data generation and batching utilities."""

 import math
 import os
 import random
 import sys
 import time

@@ -22,22 +23,28 @@ import time
 import numpy as np
 import tensorflow as tf

-from tensorflow.python.platform import gfile
+import program_utils

 FLAGS = tf.app.flags.FLAGS

-bins = [8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 64, 128]
+bins = [2 + bin_idx_i for bin_idx_i in xrange(256)]
 all_tasks = ["sort", "kvsort", "id", "rev", "rev2", "incr", "add", "left",
              "right", "left-shift", "right-shift", "bmul", "mul", "dup",
-             "badd", "qadd", "search"]
-forward_max = 128
+             "badd", "qadd", "search", "progeval", "progsynth"]
 log_filename = ""
+vocab, rev_vocab = None, None


 def pad(l):
   for b in bins:
     if b >= l: return b
-  return forward_max
+  return bins[-1]


+def bin_for(l):
+  for i, b in enumerate(bins):
+    if b >= l: return i
+  return len(bins) - 1


 train_set = {}
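For intuition (an illustrative plain-Python sketch, not code from the commit): with the new `bins = [2 + i for i in xrange(256)]`, every length from 2 to 257 gets its own bucket, so `pad` returns the smallest bin that fits a sequence and `bin_for` returns that bin's index.

```python
bins = [2 + bin_idx_i for bin_idx_i in range(256)]  # 2, 3, ..., 257

def pad(l):
    # Smallest bin length that fits a sequence of length l.
    for b in bins:
        if b >= l:
            return b
    return bins[-1]

def bin_for(l):
    # Index of that bin.
    for i, b in enumerate(bins):
        if b >= l:
            return i
    return len(bins) - 1

print(pad(5), bin_for(5))      # 5 3   -- length 5 fits bin index 3 exactly
print(pad(300), bin_for(300))  # 257 255 -- lengths past the last bin are clamped
```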
@@ -50,6 +57,35 @@ for some_task in all_tasks:
     test_set[some_task].append([])


+def read_tmp_file(name):
+  """Read from a file with the given name in our log directory or above."""
+  dirname = os.path.dirname(log_filename)
+  fname = os.path.join(dirname, name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    fname = os.path.join(dirname, "../" + name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    fname = os.path.join(dirname, "../../" + name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    return None
+  print_out("== found file: " + fname)
+  res = []
+  with tf.gfile.GFile(fname, mode="r") as f:
+    for line in f:
+      res.append(line.strip())
+  return res
+
+
+def write_tmp_file(name, lines):
+  dirname = os.path.dirname(log_filename)
+  fname = os.path.join(dirname, name + ".txt")
+  with tf.gfile.GFile(fname, mode="w") as f:
+    for line in lines:
+      f.write(line + "\n")
+
+
 def add(n1, n2, base=10):
   """Add two numbers represented as lower-endian digit lists."""
   k = max(len(n1), len(n2)) + 1
@@ -130,6 +166,30 @@ def init_data(task, length, nbr_cases, nclass):
     sorted_kv = [(k, vals[i]) for (k, i) in sorted(keys)]
     return [x for p in kv for x in p], [x for p in sorted_kv for x in p]

+  def prog_io_pair(prog, max_len, counter=0):
+    try:
+      ilen = np.random.randint(max_len - 3) + 1
+      bound = max(15 - (counter / 20), 1)
+      inp = [random.choice(range(-bound, bound)) for _ in range(ilen)]
+      inp_toks = [program_utils.prog_rev_vocab[t]
+                  for t in program_utils.tokenize(str(inp)) if t != ","]
+      out = program_utils.evaluate(prog, {"a": inp})
+      out_toks = [program_utils.prog_rev_vocab[t]
+                  for t in program_utils.tokenize(str(out)) if t != ","]
+      if counter > 400:
+        out_toks = []
+      if (out_toks and out_toks[0] == program_utils.prog_rev_vocab["["] and
+          len(out_toks) != len([o for o in out if o == ","]) + 3):
+        raise ValueError("generated list with too long ints")
+      if (out_toks and out_toks[0] != program_utils.prog_rev_vocab["["] and
+          len(out_toks) > 1):
+        raise ValueError("generated one int but tokenized it to many")
+      if len(out_toks) > max_len:
+        raise ValueError("output too long")
+      return (inp_toks, out_toks)
+    except ValueError:
+      return prog_io_pair(prog, max_len, counter+1)
+
   def spec(inp):
     """Return the target given the input for some tasks."""
     if task == "sort":
@@ -164,43 +224,114 @@ def init_data(task, length, nbr_cases, nclass):

   l = length
   cur_time = time.time()
   total_time = 0.0
-  for case in xrange(nbr_cases):
+
+  is_prog = task in ["progeval", "progsynth"]
+  if is_prog:
+    inputs_per_prog = 5
+    program_utils.make_vocab()
+    progs = read_tmp_file("programs_len%d" % (l / 10))
+    if not progs:
+      progs = program_utils.gen(l / 10, 1.2 * nbr_cases / inputs_per_prog)
+      write_tmp_file("programs_len%d" % (l / 10), progs)
+    prog_ios = read_tmp_file("programs_len%d_io" % (l / 10))
+    nbr_cases = min(nbr_cases, len(progs) * inputs_per_prog) / 1.2
+    if not prog_ios:
+      # Generate program io data.
+      prog_ios = []
+      for pidx, prog in enumerate(progs):
+        if pidx % 500 == 0:
+          print_out("== generating io pairs for program %d" % pidx)
+        if pidx * inputs_per_prog > nbr_cases * 1.2:
+          break
+        ptoks = [program_utils.prog_rev_vocab[t]
+                 for t in program_utils.tokenize(prog)]
+        ptoks.append(program_utils.prog_rev_vocab["_EOS"])
+        plen = len(ptoks)
+        for _ in xrange(inputs_per_prog):
+          if task == "progeval":
+            inp, out = prog_io_pair(prog, plen)
+            prog_ios.append(str(inp) + "\t" + str(out) + "\t" + prog)
+          elif task == "progsynth":
+            plen = max(len(ptoks), 8)
+            for _ in xrange(3):
+              inp, out = prog_io_pair(prog, plen / 2)
+              prog_ios.append(str(inp) + "\t" + str(out) + "\t" + prog)
+      write_tmp_file("programs_len%d_io" % (l / 10), prog_ios)
+    prog_ios_dict = {}
+    for s in prog_ios:
+      i, o, p = s.split("\t")
+      i_clean = "".join([c for c in i if c.isdigit() or c == " "])
+      o_clean = "".join([c for c in o if c.isdigit() or c == " "])
+      inp = [int(x) for x in i_clean.split()]
+      out = [int(x) for x in o_clean.split()]
+      if inp and out:
+        if p in prog_ios_dict:
+          prog_ios_dict[p].append([inp, out])
+        else:
+          prog_ios_dict[p] = [[inp, out]]
+    # Use prog_ios_dict to create data.
+    progs = []
+    for prog in prog_ios_dict:
+      if len([c for c in prog if c == ";"]) <= (l / 10):
+        progs.append(prog)
+    nbr_cases = min(nbr_cases, len(progs) * inputs_per_prog) / 1.2
+    print_out("== %d training cases on %d progs" % (nbr_cases, len(progs)))
+    for pidx, prog in enumerate(progs):
+      if pidx * inputs_per_prog > nbr_cases * 1.2:
+        break
+      ptoks = [program_utils.prog_rev_vocab[t]
+               for t in program_utils.tokenize(prog)]
+      ptoks.append(program_utils.prog_rev_vocab["_EOS"])
+      plen = len(ptoks)
+      dset = train_set if pidx < nbr_cases / inputs_per_prog else test_set
+      for _ in xrange(inputs_per_prog):
+        if task == "progeval":
+          inp, out = prog_ios_dict[prog].pop()
+          dset[task][bin_for(plen)].append([[ptoks, inp, [], []], [out]])
+        elif task == "progsynth":
+          plen, ilist = max(len(ptoks), 8), [[]]
+          for _ in xrange(3):
+            inp, out = prog_ios_dict[prog].pop()
+            ilist.append(inp + out)
+          dset[task][bin_for(plen)].append([ilist, [ptoks]])
+
+  for case in xrange(0 if is_prog else nbr_cases):
     total_time += time.time() - cur_time
     cur_time = time.time()
     if l > 10000 and case % 100 == 1:
       print_out("  avg gen time %.4f s" % (total_time / float(case)))
     if task in ["add", "badd", "qadd", "bmul", "mul"]:
       i, t = rand_pair(l, task)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[[], i, [], []], [t]])
       i, t = rand_pair(l, task)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[[], i, [], []], [t]])
     elif task == "dup":
       i, t = rand_dup_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_dup_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "rev2":
       i, t = rand_rev2_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_rev2_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "search":
       i, t = rand_search_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_search_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "kvsort":
       i, t = rand_kvsort_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_kvsort_pair(l)
-      test_set[task][len(i)].append([i, t])
-    else:
+      test_set[task][bin_for(len(i))].append([[i], [t]])
+    elif task not in ["progeval", "progsynth"]:
       inp = [np.random.randint(nclass - 1) + 1 for i in xrange(l)]
       target = spec(inp)
-      train_set[task][l].append([inp, target])
+      train_set[task][bin_for(l)].append([[inp], [target]])
       inp = [np.random.randint(nclass - 1) + 1 for i in xrange(l)]
       target = spec(inp)
-      test_set[task][l].append([inp, target])
+      test_set[task][bin_for(l)].append([[inp], [target]])
 def to_symbol(i):

@@ -218,37 +349,31 @@ def to_id(s):
   return int(s) + 1


-def get_batch(max_length, batch_size, do_train, task, offset=None, preset=None):
+def get_batch(bin_id, batch_size, data_set, height, offset=None, preset=None):
   """Get a batch of data, training or testing."""
-  inputs = []
-  targets = []
-  length = max_length
-  if preset is None:
-    cur_set = test_set[task]
-    if do_train: cur_set = train_set[task]
-    while not cur_set[length]:
-      length -= 1
-  pad_length = pad(length)
+  inputs, targets = [], []
+  pad_length = bins[bin_id]
   for b in xrange(batch_size):
     if preset is None:
-      elem = random.choice(cur_set[length])
-      if offset is not None and offset + b < len(cur_set[length]):
-        elem = cur_set[length][offset + b]
+      elem = random.choice(data_set[bin_id])
+      if offset is not None and offset + b < len(data_set[bin_id]):
+        elem = data_set[bin_id][offset + b]
     else:
       elem = preset
-    inp, target = elem[0], elem[1]
-    assert len(inp) == length
-    inputs.append(inp + [0 for l in xrange(pad_length - len(inp))])
-    targets.append(target + [0 for l in xrange(pad_length - len(target))])
-  res_input = []
-  res_target = []
-  for l in xrange(pad_length):
-    new_input = np.array([inputs[b][l] for b in xrange(batch_size)],
-                         dtype=np.int32)
-    new_target = np.array([targets[b][l] for b in xrange(batch_size)],
-                          dtype=np.int32)
-    res_input.append(new_input)
-    res_target.append(new_target)
+    inpt, targett, inpl, targetl = elem[0], elem[1], [], []
+    for inp in inpt:
+      inpl.append(inp + [0 for _ in xrange(pad_length - len(inp))])
+    if len(inpl) == 1:
+      for _ in xrange(height - 1):
+        inpl.append([0 for _ in xrange(pad_length)])
+    for target in targett:
+      targetl.append(target + [0 for _ in xrange(pad_length - len(target))])
+    inputs.append(inpl)
+    targets.append(targetl)
+  res_input = np.array(inputs, dtype=np.int32)
+  res_target = np.array(targets, dtype=np.int32)
+  assert list(res_input.shape) == [batch_size, height, pad_length]
+  assert list(res_target.shape) == [batch_size, 1, pad_length]
   return res_input, res_target
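To make the new storage and batching layout concrete, here is an illustrative sketch (not code from the commit; values are made up) of how one single-row example is stored, padded to the bin length, and extended with zero rows up to `height`, matching the asserts in the reconstructed `get_batch` above:

```python
# Illustrative only -- not code from the repository.
# Old element layout:  train_set[task][len(i)]          == [i, t]
# New element layout:  train_set[task][bin_for(len(i))] == [input_rows, [t]]
import numpy as np

pad_length, height = 8, 4
inp, tgt = [3, 5, 7], [7, 5, 3]

inpl = [inp + [0] * (pad_length - len(inp))]           # pad the one input row
inpl += [[0] * pad_length for _ in range(height - 1)]  # zero rows up to height
tgtl = [tgt + [0] * (pad_length - len(tgt))]

res_input = np.array([inpl], dtype=np.int32)   # batch of one example
res_target = np.array([tgtl], dtype=np.int32)
print(res_input.shape, res_target.shape)       # (1, 4, 8) (1, 1, 8)
```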
@@ -256,11 +381,11 @@ def print_out(s, newline=True):
   """Print a message out and log it to file."""
   if log_filename:
     try:
-      with gfile.GFile(log_filename, mode="a") as f:
+      with tf.gfile.GFile(log_filename, mode="a") as f:
        f.write(s + ("\n" if newline else ""))
     # pylint: disable=bare-except
     except:
-      sys.stdout.write("Error appending to %s\n" % log_filename)
+      sys.stderr.write("Error appending to %s\n" % log_filename)
   sys.stdout.write(s + ("\n" if newline else ""))
   sys.stdout.flush()
@@ -269,21 +394,36 @@ def decode(output):
   return [np.argmax(o, axis=1) for o in output]


-def accuracy(inpt, output, target, batch_size, nprint):
+def accuracy(inpt_t, output, target_t, batch_size, nprint,
+             beam_out=None, beam_scores=None):
   """Calculate output accuracy given target."""
   assert nprint < batch_size + 1
+  inpt = []
+  for h in xrange(inpt_t.shape[1]):
+    inpt.extend([inpt_t[:, h, l] for l in xrange(inpt_t.shape[2])])
+  target = [target_t[:, 0, l] for l in xrange(target_t.shape[2])]
+  def tok(i):
+    if rev_vocab and i < len(rev_vocab):
+      return rev_vocab[i]
+    return str(i - 1)
   def task_print(inp, output, target):
     stop_bound = 0
     print_len = 0
     while print_len < len(target) and target[print_len] > stop_bound:
       print_len += 1
-    print_out("    i: " + " ".join([str(i - 1) for i in inp if i > 0]))
+    print_out("    i: " + " ".join([tok(i) for i in inp if i > 0]))
     print_out("    o: " +
-              " ".join([str(output[l] - 1) for l in xrange(print_len)]))
+              " ".join([tok(output[l]) for l in xrange(print_len)]))
     print_out("    t: " +
-              " ".join([str(target[l] - 1) for l in xrange(print_len)]))
+              " ".join([tok(target[l]) for l in xrange(print_len)]))
   decoded_target = target
   decoded_output = decode(output)
+  # Use beam output if given and score is high enough.
+  if beam_out is not None:
+    for b in xrange(batch_size):
+      if beam_scores[b] >= 10.0:
+        for l in xrange(min(len(decoded_output), beam_out.shape[2])):
+          decoded_output[l][b] = int(beam_out[b, 0, l])
   total = 0
   errors = 0
   seq = [0 for b in xrange(batch_size)]
@@ -311,6 +451,7 @@ def accuracy(inpt, output, target, batch_size, nprint):

 def safe_exp(x):
   perp = 10000
+  x = float(x)
   if x < 100: perp = math.exp(x)
   if perp > 10000: return 10000
   return perp
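As a quick illustration (not from the commit), `safe_exp` reports a perplexity that is capped at 10000, so very large loss values never hit an overflow in `math.exp`:

```python
import math

def safe_exp(x):
  perp = 10000
  x = float(x)
  if x < 100: perp = math.exp(x)
  if perp > 10000: return 10000
  return perp

print(safe_exp(2.0))    # ~7.389
print(safe_exp(250.0))  # 10000 -- exp() is skipped entirely once x >= 100
```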
neural_gpu/neural_gpu.py

@@ -16,26 +16,34 @@
 import time

 import numpy as np
 import tensorflow as tf

-import data_utils
+from tensorflow.python.framework import function
+import data_utils as data
+
+do_jit = False  # Gives more speed but experimental for now.
+jit_scope = tf.contrib.compiler.jit.experimental_jit_scope


-def conv_linear(args, kw, kh, nin, nout, do_bias, bias_start, prefix):
+def conv_linear(args, kw, kh, nin, nout, rate, do_bias, bias_start, prefix):
   """Convolutional linear map."""
   assert args is not None
   if not isinstance(args, (list, tuple)):
     args = [args]
   with tf.variable_scope(prefix):
-    k = tf.get_variable("CvK", [kw, kh, nin, nout])
+    with tf.device("/cpu:0"):
+      k = tf.get_variable("CvK", [kw, kh, nin, nout])
     if len(args) == 1:
-      res = tf.nn.conv2d(args[0], k, [1, 1, 1, 1], "SAME")
+      arg = args[0]
     else:
-      res = tf.nn.conv2d(tf.concat(3, args), k, [1, 1, 1, 1], "SAME")
+      arg = tf.concat(args, 3)
+    res = tf.nn.convolution(arg, k, dilation_rate=(rate, 1), padding="SAME")
     if not do_bias: return res
-    bias_term = tf.get_variable("CvB", [nout],
-                                initializer=tf.constant_initializer(0.0))
-    return res + bias_term + bias_start
+    with tf.device("/cpu:0"):
+      bias_term = tf.get_variable(
+          "CvB", [nout], initializer=tf.constant_initializer(bias_start))
+    bias_term = tf.reshape(bias_term, [1, 1, 1, nout])
+    return res + bias_term
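As an aside (not part of the commit): `dilation_rate=(rate, 1)` makes the kernel skip `rate - 1` positions along the length axis, which is what the new `atrous` option later in this file exploits by doubling the rate per layer. A minimal NumPy-free sketch, with hypothetical names, of which input positions a dilated kernel touches:

```python
def dilated_positions(center, kernel_width, rate):
    # Input positions a kernel of width kernel_width touches at the given
    # dilation rate (stride between taps = rate).
    half = kernel_width // 2
    return [center + rate * k for k in range(-half, half + 1)]

print(dilated_positions(10, 3, 1))  # [9, 10, 11]  -- ordinary convolution
print(dilated_positions(10, 3, 2))  # [8, 10, 12]  -- atrous, rate 2
print(dilated_positions(10, 3, 4))  # [6, 10, 14]  -- rate 4, wider receptive field
```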
 def sigmoid_cutoff(x, cutoff):

@@ -43,7 +51,34 @@ def sigmoid_cutoff(x, cutoff):
   y = tf.sigmoid(x)
   if cutoff < 1.01: return y
   d = (cutoff - 1.0) / 2.0
-  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d))
+  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d), name="cutoff_min")


+@function.Defun(tf.float32, noinline=True)
+def sigmoid_cutoff_12(x):
+  """Sigmoid with cutoff 1.2, specialized for speed and memory use."""
+  y = tf.sigmoid(x)
+  return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1), name="cutoff_min_12")
+
+
+@function.Defun(tf.float32, noinline=True)
+def sigmoid_hard(x):
+  """Hard sigmoid."""
+  return tf.minimum(1.0, tf.maximum(0.0, 0.25 * x + 0.5))
+
+
+def place_at14(decided, selected, it):
+  """Place selected at it-th coordinate of decided, dim=1 of 4."""
+  slice1 = decided[:, :it, :, :]
+  slice2 = decided[:, it+1:, :, :]
+  return tf.concat([slice1, selected, slice2], 1)
+
+
+def place_at13(decided, selected, it):
+  """Place selected at it-th coordinate of decided, dim=1 of 3."""
+  slice1 = decided[:, :it, :]
+  slice2 = decided[:, it+1:, :]
+  return tf.concat([slice1, selected, slice2], 1)
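For intuition (an illustrative plain-Python sketch, not code from the commit): the `cutoff` family of gates rescales a sigmoid so it saturates at exactly 0 and 1, and `sigmoid_hard` replaces it with a piecewise-linear version once the cutoff is very large:

```python
import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

def sigmoid_cutoff(x, cutoff):
    # Scaled sigmoid clipped to [0, 1]; saturates exactly for large |x|.
    y = sigmoid(x)
    if cutoff < 1.01:
        return y
    d = (cutoff - 1.0) / 2.0
    return min(1.0, max(0.0, cutoff * y - d))

def sigmoid_hard(x):
    # Piecewise-linear approximation used when cutoff > 10.
    return min(1.0, max(0.0, 0.25 * x + 0.5))

for x in (-6.0, 0.0, 6.0):
    print(x, round(sigmoid_cutoff(x, 1.2), 3), sigmoid_hard(x))
# With cutoff=1.2 the gate reaches exactly 0.0 / 1.0 for large |x|,
# e.g. sigmoid_cutoff(6.0, 1.2) == 1.0 while sigmoid(6.0) is about 0.9975.
```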
 def tanh_cutoff(x, cutoff):

@@ -54,18 +89,80 @@ def tanh_cutoff(x, cutoff):
   return tf.minimum(1.0, tf.maximum(-1.0, (1.0 + d) * y))


-def conv_gru(inpts, mem, kw, kh, nmaps, cutoff, prefix):
+@function.Defun(tf.float32, noinline=True)
+def tanh_hard(x):
+  """Hard tanh."""
+  return tf.minimum(1.0, tf.maximum(0.0, x))
+
+
+def layer_norm(x, nmaps, prefix, epsilon=1e-5):
+  """Layer normalize the 4D tensor x, averaging over the last dimension."""
+  with tf.variable_scope(prefix):
+    scale = tf.get_variable("layer_norm_scale", [nmaps],
+                            initializer=tf.ones_initializer())
+    bias = tf.get_variable("layer_norm_bias", [nmaps],
+                           initializer=tf.zeros_initializer())
+    mean, variance = tf.nn.moments(x, [3], keep_dims=True)
+    norm_x = (x - mean) / tf.sqrt(variance + epsilon)
+    return norm_x * scale + bias
+
+
+def conv_gru(inpts, mem, kw, kh, nmaps, rate, cutoff, prefix, do_layer_norm,
+             args_len=None):
   """Convolutional GRU."""
   def conv_lin(args, suffix, bias_start):
-    return conv_linear(args, kw, kh, len(args) * nmaps, nmaps, True,
-                       bias_start, prefix + "/" + suffix)
-  reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
-  # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
-  candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
-  gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
+    total_args_len = args_len or len(args) * nmaps
+    res = conv_linear(args, kw, kh, total_args_len, nmaps, rate, True,
+                      bias_start, prefix + "/" + suffix)
+    if do_layer_norm:
+      return layer_norm(res, nmaps, prefix + "/" + suffix)
+    else:
+      return res
+  if cutoff == 1.2:
+    reset = sigmoid_cutoff_12(conv_lin(inpts + [mem], "r", 1.0))
+    gate = sigmoid_cutoff_12(conv_lin(inpts + [mem], "g", 1.0))
+  elif cutoff > 10:
+    reset = sigmoid_hard(conv_lin(inpts + [mem], "r", 1.0))
+    gate = sigmoid_hard(conv_lin(inpts + [mem], "g", 1.0))
+  else:
+    reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
+    gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
+  if cutoff > 10:
+    candidate = tf.tanh_hard(conv_lin(inpts + [reset * mem], "c", 0.0))
+  else:
+    # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
+    candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
   return gate * mem + (1 - gate) * candidate


+CHOOSE_K = 256
+
+
+def memory_call(q, l, nmaps, mem_size, vocab_size, num_gpus, update_mem):
+  raise ValueError("Fill for experiments with additional memory structures.")
+
+
+def memory_run(step, nmaps, mem_size, batch_size, vocab_size,
+               global_step, do_training, update_mem, decay_factor, num_gpus,
+               target_emb_weights, output_w, gpu_targets_tn, it):
+  """Run memory."""
+  q = step[:, 0, it, :]
+  mlabels = gpu_targets_tn[:, it, 0]
+  res, mask, mem_loss = memory_call(
+      q, mlabels, nmaps, mem_size, vocab_size, num_gpus, update_mem)
+  res = tf.gather(target_emb_weights, res) * tf.expand_dims(mask[:, 0], 1)
+
+  # Mix gold and original in the first steps, 20% later.
+  gold = tf.nn.dropout(tf.gather(target_emb_weights, mlabels), 0.7)
+  use_gold = 1.0 - tf.cast(global_step, tf.float32) / (1000. * decay_factor)
+  use_gold = tf.maximum(use_gold, 0.2) * do_training
+  mem = tf.cond(tf.less(tf.random_uniform([]), use_gold),
+                lambda: use_gold * gold + (1.0 - use_gold) * res,
+                lambda: res)
+  mem = tf.reshape(mem, [-1, 1, 1, nmaps])
+  return mem, mem_loss, update_mem
+
+
+@tf.RegisterGradient("CustomIdG")
+def _custom_id_grad(_, grads):
+  return grads
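As a side note (an illustrative NumPy sketch, not from the commit): `layer_norm` above normalizes each position of the 4-D state over its `nmaps` channels before applying a learned scale and bias.

```python
import numpy as np

def layer_norm_np(x, scale, bias, epsilon=1e-5):
    # x: [batch, length, height, nmaps]; normalize over the last axis.
    mean = x.mean(axis=3, keepdims=True)
    variance = x.var(axis=3, keepdims=True)
    norm_x = (x - mean) / np.sqrt(variance + epsilon)
    return norm_x * scale + bias

x = np.random.randn(2, 5, 4, 8).astype(np.float32)
y = layer_norm_np(x, scale=np.ones(8), bias=np.zeros(8))
print(abs(y.mean(axis=3)).max())  # ~0: every position is now zero-mean
```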
@@ -86,237 +183,560 @@ def quantize_weights_op(quant_scale, max_value):
   return tf.group(*ops)


 def relaxed_average(var_name_suffix, rx_step):
   """Calculate the average of relaxed variables having var_name_suffix."""
   relaxed_vars = []
   for l in xrange(rx_step):
     with tf.variable_scope("RX%d" % l, reuse=True):
       try:
         relaxed_vars.append(tf.get_variable(var_name_suffix))
       except ValueError:
         pass
   dsum = tf.add_n(relaxed_vars)
   avg = dsum / len(relaxed_vars)
   diff = [v - avg for v in relaxed_vars]
   davg = tf.add_n([d * d for d in diff])
   return avg, tf.reduce_sum(davg)


 def relaxed_distance(rx_step):
   """Distance between relaxed variables and their average."""
   res, ops, rx_done = [], [], {}
   for v in tf.trainable_variables():
     if v.name[0:2] == "RX":
       rx_name = v.op.name[v.name.find("/") + 1:]
       if rx_name not in rx_done:
         avg, dist_loss = relaxed_average(rx_name, rx_step)
         res.append(dist_loss)
         rx_done[rx_name] = avg
       ops.append(v.assign(rx_done[rx_name]))
   return tf.add_n(res), tf.group(*ops)


-def make_dense(targets, noclass):
+def autoenc_quantize(x, nbits, nmaps, do_training, layers=1):
+  """Autoencoder into nbits vectors of bits, using noise and sigmoids."""
+  enc_x = tf.reshape(x, [-1, nmaps])
+  for i in xrange(layers - 1):
+    enc_x = tf.layers.dense(enc_x, nmaps, name="autoenc_%d" % i)
+  enc_x = tf.layers.dense(enc_x, nbits, name="autoenc_%d" % (layers - 1))
+  noise = tf.truncated_normal(tf.shape(enc_x), stddev=2.0)
+  dec_x = sigmoid_cutoff_12(enc_x + noise * do_training)
+  dec_x = tf.reshape(dec_x, [-1, nbits])
+  for i in xrange(layers):
+    dec_x = tf.layers.dense(dec_x, nmaps, name="autodec_%d" % i)
+  return tf.reshape(dec_x, tf.shape(x))
+
+
+def make_dense(targets, noclass, low_param):
   """Move a batch of targets to a dense 1-hot representation."""
-  with tf.device("/cpu:0"):
-    shape = tf.shape(targets)
-    batch_size = shape[0]
-    indices = targets + noclass * tf.range(0, batch_size)
-    length = tf.expand_dims(batch_size * noclass, 0)
-    dense = tf.sparse_to_dense(indices, length, 1.0, 0.0)
-  return tf.reshape(dense, [-1, noclass])
-
-
-def check_for_zero(sparse):
-  """In a sparse batch of ints, make 1.0 if it's 0 and 0.0 else."""
-  with tf.device("/cpu:0"):
-    shape = tf.shape(sparse)
-    batch_size = shape[0]
-    sparse = tf.minimum(sparse, 1)
-    indices = sparse + 2 * tf.range(0, batch_size)
-    dense = tf.sparse_to_dense(indices, tf.expand_dims(2 * batch_size, 0),
-                               1.0, 0.0)
-    reshaped = tf.reshape(dense, [-1, 2])
-  return tf.reshape(tf.slice(reshaped, [0, 0], [-1, 1]), [-1])
+  low = low_param / float(noclass - 1)
+  high = 1.0 - low * (noclass - 1)
+  targets = tf.cast(targets, tf.int64)
+  return tf.one_hot(targets, depth=noclass, on_value=high, off_value=low)
+
+
+def reorder_beam(beam_size, batch_size, beam_val, output, is_first,
+                 tensors_to_reorder):
+  """Reorder to minimize beam costs."""
+  # beam_val is [batch_size x beam_size]; let b = batch_size * beam_size
+  # decided is len x b x a x b
+  # output is b x out_size; step is b x len x a x b;
+  outputs = tf.split(tf.nn.log_softmax(output), beam_size, 0)
+  all_beam_vals, all_beam_idx = [], []
+  beam_range = 1 if is_first else beam_size
+  for i in xrange(beam_range):
+    top_out, top_out_idx = tf.nn.top_k(outputs[i], k=beam_size)
+    cur_beam_val = beam_val[:, i]
+    top_out = tf.Print(top_out, [top_out, top_out_idx, beam_val, i,
+                                 cur_beam_val], "GREPO", summarize=8)
+    all_beam_vals.append(top_out + tf.expand_dims(cur_beam_val, 1))
+    all_beam_idx.append(top_out_idx)
+  all_beam_idx = tf.reshape(tf.transpose(tf.concat(all_beam_idx, 1), [1, 0]),
+                            [-1])
+  top_beam, top_beam_idx = tf.nn.top_k(tf.concat(all_beam_vals, 1), k=beam_size)
+  top_beam_idx = tf.Print(top_beam_idx, [top_beam, top_beam_idx],
+                          "GREP", summarize=8)
+  reordered = [[] for _ in xrange(len(tensors_to_reorder) + 1)]
+  top_out_idx = []
+  for i in xrange(beam_size):
+    which_idx = top_beam_idx[:, i] * batch_size + tf.range(batch_size)
+    top_out_idx.append(tf.gather(all_beam_idx, which_idx))
+    which_beam = top_beam_idx[:, i] / beam_size  # [batch]
+    which_beam = which_beam * batch_size + tf.range(batch_size)
+    reordered[0].append(tf.gather(output, which_beam))
+    for i, t in enumerate(tensors_to_reorder):
+      reordered[i + 1].append(tf.gather(t, which_beam))
+  new_tensors = [tf.concat(t, 0) for t in reordered]
+  top_out_idx = tf.concat(top_out_idx, 0)
+  return (top_beam, new_tensors[0], top_out_idx, new_tensors[1:])
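Stepping back to `make_dense` above: the new `low_param` argument implements label smoothing via `tf.one_hot` with `on_value`/`off_value`. An illustrative NumPy check (not from the commit) of the values it produces:

```python
import numpy as np

def make_dense_np(targets, noclass, low_param):
    # Mirrors the TF version: off_value = low_param / (noclass - 1),
    # on_value = 1 - low_param, so every row still sums to 1.
    low = low_param / float(noclass - 1)
    high = 1.0 - low * (noclass - 1)
    out = np.full((len(targets), noclass), low)
    out[np.arange(len(targets)), targets] = high
    return out

dense = make_dense_np([2, 0], noclass=4, low_param=0.1)
print(dense.round(3))     # rows like [0.033 0.033 0.9 0.033]
print(dense.sum(axis=1))  # [1. 1.]
```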
 class NeuralGPU(object):
   """Neural GPU Model."""

-  def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
-               max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
-               learning_rate, pull, pull_incr, min_length, act_noise=0.0):
+  def __init__(self, nmaps, vec_size, niclass, noclass, dropout,
+               max_grad_norm, cutoff, nconvs, kw, kh, height, mem_size,
+               learning_rate, min_length, num_gpus, num_replicas,
+               grad_noise_scale, sampling_rate, act_noise=0.0, do_rnn=False,
+               atrous=False, beam_size=1, backward=True, do_layer_norm=False,
+               autoenc_decay=1.0):
     # Feeds for parameters and ops to update them.
-    self.global_step = tf.Variable(0, trainable=False)
-    self.cur_length = tf.Variable(min_length, trainable=False)
-    self.cur_length_incr_op = self.cur_length.assign_add(1)
-    self.lr = tf.Variable(float(learning_rate), trainable=False)
-    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
-    self.pull = tf.Variable(float(pull), trainable=False)
-    self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
+    self.nmaps = nmaps
+    if backward:
+      self.global_step = tf.Variable(0, trainable=False, name="global_step")
+      self.cur_length = tf.Variable(min_length, trainable=False)
+      self.cur_length_incr_op = self.cur_length.assign_add(1)
+      self.lr = tf.Variable(learning_rate, trainable=False)
+      self.lr_decay_op = self.lr.assign(self.lr * 0.995)
     self.do_training = tf.placeholder(tf.float32, name="do_training")
+    self.update_mem = tf.placeholder(tf.int32, name="update_mem")
     self.noise_param = tf.placeholder(tf.float32, name="noise_param")

     # Feeds for inputs, targets, outputs, losses, etc.
-    self.input = []
-    self.target = []
-    for l in xrange(data_utils.forward_max + 1):
-      self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
-      self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
-    self.outputs = []
-    self.losses = []
-    self.grad_norms = []
-    self.updates = []
+    self.input = tf.placeholder(tf.int32, name="inp")
+    self.target = tf.placeholder(tf.int32, name="tgt")
+    self.prev_step = tf.placeholder(tf.float32, name="prev_step")
+    gpu_input = tf.split(self.input, num_gpus, 0)
+    gpu_target = tf.split(self.target, num_gpus, 0)
+    gpu_prev_step = tf.split(self.prev_step, num_gpus, 0)
+    batch_size = tf.shape(gpu_input[0])[0]
+
+    if backward:
+      adam_lr = 0.005 * self.lr
+      adam = tf.train.AdamOptimizer(adam_lr, epsilon=2e-4)
+
+      def adam_update(grads):
+        return adam.apply_gradients(zip(grads, tf.trainable_variables()),
+                                    global_step=self.global_step,
+                                    name="adam_update")
+
+    # When switching from Adam to SGD we perform reverse-decay.
+    if backward:
+      global_step_float = tf.cast(self.global_step, tf.float32)
+      sampling_decay_exponent = global_step_float / 100000.0
+      sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent))
+      self.sampling = sampling_rate * 0.05 / sampling_decay
+    else:
+      self.sampling = tf.constant(0.0)
+
+    # Cache variables on cpu if needed.
+    if num_replicas > 1 or num_gpus > 1:
+      with tf.device("/cpu:0"):
+        caching_const = tf.constant(0)
+      tf.get_variable_scope().set_caching_device(caching_const.op.device)
+      # partitioner = tf.variable_axis_size_partitioner(1024*256*4)
+      # tf.get_variable_scope().set_partitioner(partitioner)
+
+    def gpu_avg(l):
+      if l[0] is None:
+        for elem in l:
+          assert elem is None
+        return 0.0
+      if len(l) < 2:
+        return l[0]
+      return sum(l) / float(num_gpus)
+
+    self.length_tensor = tf.placeholder(tf.int32, name="length")

     # Computation.
-    inp0_shape = tf.shape(self.input[0])
-    batch_size = inp0_shape[0]
     with tf.device("/cpu:0"):
       emb_weights = tf.get_variable(
           "embedding", [niclass, vec_size],
           initializer=tf.random_uniform_initializer(-1.7, 1.7))
+      if beam_size > 0:
+        target_emb_weights = tf.get_variable(
+            "target_embedding", [noclass, nmaps],
+            initializer=tf.random_uniform_initializer(-1.7, 1.7))
       e0 = tf.scatter_update(emb_weights,
                              tf.constant(0, dtype=tf.int32, shape=[1]),
                              tf.zeros([1, vec_size]))
+    output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32)

-    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
-
-    # Main graph creation loop, for every bin in data_utils.
-    self.steps = []
-    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
-      data_utils.print_out("Creating model for bin of length %d." % length)
-      start_time = time.time()
-      if length > data_utils.bins[0]:
+    def conv_rate(layer):
+      if atrous:
+        return 2**layer
+      return 1
+
+    # pylint: disable=cell-var-from-loop
+    def enc_step(step):
+      """Encoder step."""
+      if autoenc_decay < 1.0:
+        quant_step = autoenc_quantize(step, 16, nmaps, self.do_training)
+        if backward:
+          exp_glob = tf.train.exponential_decay(
+              1.0, self.global_step - 10000, 1000, autoenc_decay)
+          dec_factor = 1.0 - exp_glob  # * self.do_training
+          dec_factor = tf.cond(tf.less(self.global_step, 10500),
+                               lambda: tf.constant(0.05), lambda: dec_factor)
+        else:
+          dec_factor = 1.0
+        cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
+                      lambda: quant_step, lambda: step)
+      else:
+        cur = step
+      if dropout > 0.0001:
+        cur = tf.nn.dropout(cur, keep_prob)
+      if act_noise > 0.00001:
+        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
+      # Do nconvs-many CGRU steps.
+      if do_jit and tf.get_variable_scope().reuse:
+        with jit_scope():
+          for layer in xrange(nconvs):
+            cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
+                           cutoff, "ecgru_%d" % layer, do_layer_norm)
+      else:
+        for layer in xrange(nconvs):
+          cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
+                         cutoff, "ecgru_%d" % layer, do_layer_norm)
+      return cur
+
+    zero_tgt = tf.zeros([batch_size, nmaps, 1])
+    zero_tgt.set_shape([None, nmaps, 1])
+
+    def dec_substep(step, decided):
+      """Decoder sub-step."""
+      cur = step
+      if dropout > 0.0001:
+        cur = tf.nn.dropout(cur, keep_prob)
+      if act_noise > 0.00001:
+        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
+      # Do nconvs-many CGRU steps, conditioned on what was decided so far.
+      if do_jit and tf.get_variable_scope().reuse:
+        with jit_scope():
+          for layer in xrange(nconvs):
+            cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
+                           cutoff, "dcgru_%d" % layer, do_layer_norm)
+      else:
+        for layer in xrange(nconvs):
+          cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
+                         cutoff, "dcgru_%d" % layer, do_layer_norm)
+      return cur
+    # pylint: enable=cell-var-from-loop
+
+    def dec_step(step, it, it_int, decided, output_ta, tgts,
+                 mloss, nupd_in, out_idx, beam_cost):
+      """Decoder step."""
+      nupd, mem_loss = 0, 0.0
+      if mem_size > 0:
+        it_incr = tf.minimum(it + 1, length - 1)
+        mem, mem_loss, nupd = memory_run(
+            step, nmaps, mem_size, batch_size, noclass, self.global_step,
+            self.do_training, self.update_mem, 10, num_gpus,
+            target_emb_weights, output_w, gpu_targets_tn, it_incr)
+      step = dec_substep(step, decided)
+      output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
+      # Calculate argmax output.
+      output = tf.reshape(output_l, [-1, nmaps])
+      # pylint: disable=cell-var-from-loop
+      output = tf.matmul(output, output_w)
+      if beam_size > 1:
+        beam_cost, output, out, reordered = reorder_beam(
+            beam_size, batch_size, beam_cost, output, it_int == 0,
+            [output_l, out_idx, step, decided])
+        [output_l, out_idx, step, decided] = reordered
+      else:
+        # Scheduled sampling.
+        out = tf.multinomial(tf.stop_gradient(output), 1)
+        out = tf.to_int32(tf.squeeze(out, [1]))
+      out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
+      output = tf.gather(target_emb_weights, out)
+      output = tf.reshape(output, [-1, 1, nmaps])
+      output = tf.concat([output] * height, 1)
+      tgt = tgts[it, :, :, :]
+      selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
+                         lambda: output, lambda: tgt)
+      # pylint: enable=cell-var-from-loop
+      dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
+      out_idx = place_at13(
+          out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
+      if mem_size > 0:
+        mem = tf.concat([mem] * height, 2)
+        dec_write = place_at14(dec_write, mem, it_incr)
+      return (step, dec_write, out_write, mloss + mem_loss,
+              nupd_in + nupd, out_idx, beam_cost)
+
+    # Main model construction.
+    gpu_outputs = []
+    gpu_losses = []
+    gpu_grad_norms = []
+    grads_list = []
+    gpu_out_idx = []
+    self.after_enc_step = []
+    for gpu in xrange(num_gpus):  # Multi-GPU towers, average gradients later.
+      length = self.length_tensor
+      length_float = tf.cast(length, tf.float32)
+      if gpu > 0:
+        tf.get_variable_scope().reuse_variables()
+      gpu_outputs.append([])
+      gpu_losses.append([])
+      gpu_grad_norms.append([])
+      with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
+        # Main graph creation loop.
+        data.print_out("Creating model.")
+        start_time = time.time()
+        # Embed inputs and calculate mask.
+        with tf.device("/cpu:0"):
+          tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
+          weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0,
+                             tf.ones(tgt_shape), tf.zeros(tgt_shape))
+          # Embed inputs and targets.
+          with tf.control_dependencies([e0]):
+            start = tf.gather(emb_weights, gpu_input[gpu])  # b x h x l x nmaps
+            gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
+            if beam_size > 0:
+              embedded_targets_tn = tf.gather(target_emb_weights,
+                                              gpu_targets_tn)
+              embedded_targets_tn = tf.transpose(
+                  embedded_targets_tn, [2, 0, 1, 3])  # len x b x 1 x nmaps
+              embedded_targets_tn = tf.concat(
+                  [embedded_targets_tn] * height, 2)
+
+        # First image comes from start by applying convolution and adding 0s.
+        start = tf.transpose(start, [0, 2, 1, 3])  # Now b x len x h x vec_s
+        first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input")
+        first = layer_norm(first, nmaps, "input")
+
+        # Computation steps.
+        keep_prob = dropout * 3.0 / tf.sqrt(length_float)
+        keep_prob = 1.0 - self.do_training * keep_prob
+        act_noise_scale = act_noise * self.do_training
+
+        # Start with a convolutional gate merging previous step.
+        step = conv_gru([gpu_prev_step[gpu]], first, kw, kh, nmaps, 1,
+                        cutoff, "first", do_layer_norm)
+
+        if do_rnn:
+          # This is just for running a baseline RNN seq2seq model.
+          self.after_enc_step.append(step)  # Not meaningful here, but needed.
+          # ... baseline branch: a MultiRNNCell of BasicLSTMCells run with
+          # tf.nn.dynamic_rnn over the reshaped step, an attention_query
+          # Defun over the encoder outputs, a tf.scan decoder loop
+          # (decoder_loop_fn, with optional memory_call), a dense "out_proj",
+          # and logits via output_w ...
+          gpu_out_idx.append(tf.argmax(outputs, 2))
+        else:
+          # Here we go with the Neural GPU.
+          # Encoder.
+          enc_length = length
+          step = enc_step(step)  # First step hard-coded.
+          # ... the remaining enc_length - 1 encoder steps run enc_step inside
+          # a tf.while_loop (enc_step_lambda) with parallel_iterations=1 and
+          # swap_memory=True ...
+          self.after_enc_step.append(step)
+
+          # Decoder.
+          if beam_size > 0:
+            output_ta = tf.TensorArray(
+                dtype=tf.float32, size=length, dynamic_size=False,
+                infer_shape=False, name="outputs")
+            out_idx = tf.zeros([beam_size * batch_size, length, 1],
+                               dtype=tf.int32)
+            decided_t = tf.zeros([beam_size * batch_size, length,
+                                  height, vec_size])
+            # Prepare for beam search.
+            tgts = tf.concat([embedded_targets_tn] * beam_size, 1)
+            beam_cost = tf.zeros([batch_size, beam_size])
+            step = tf.concat([step] * beam_size, 0)
+            # First step hard-coded; the remaining steps run dec_step inside
+            # a tf.while_loop (step_lambda) with reuse_variables().
+            # ...
+            gpu_out_idx.append(tf.squeeze(out_idx, [2]))
+            outputs = output_ta.stack()
+            outputs = tf.squeeze(outputs, [2, 3])  # Now l x b x nmaps
+          else:
+            # If beam_size is 0 or less, we don't have a decoder.
+            mem_loss = 0.0
+            outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
+            gpu_out_idx.append(tf.argmax(outputs, 2))
+
+        # Final convolution to get logits, list outputs.
+        outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
+        outputs = tf.reshape(outputs, [length, batch_size, noclass])
+        gpu_outputs[gpu] = tf.nn.softmax(outputs)
+
+        # Calculate cross-entropy loss and normalize it.
+        targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
+                                  noclass, 0.1)
+        targets_soft = tf.reshape(targets_soft, [-1, noclass])
+        targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
+                                  noclass, 0.0)
+        targets_hard = tf.reshape(targets_hard, [-1, noclass])
+        output = tf.transpose(outputs, [1, 0, 2])
+        xent_soft = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
+            logits=tf.reshape(output, [-1, noclass]), labels=targets_soft),
+                               [batch_size, length])
+        xent_hard = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
+            logits=tf.reshape(output, [-1, noclass]), labels=targets_hard),
+                               [batch_size, length])
+        low, high = 0.1 / float(noclass - 1), 0.9
+        const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low)
+        weight_sum = tf.reduce_sum(weights) + 1e-20
+        true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
+        soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
+        perp_loss = soft_loss + const
+        # Final loss: cross-entropy + shared parameter relaxation part + extra.
+        mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
+        total_loss = perp_loss + mem_loss
+        gpu_losses[gpu].append(true_perp)
+
+        # Gradients.
+        if backward:
+          data.print_out("Creating backward pass for the model.")
+          grads = tf.gradients(total_loss, tf.trainable_variables(),
+                               colocate_gradients_with_ops=True)
+          for g_i, g in enumerate(grads):
+            if isinstance(g, tf.IndexedSlices):
+              grads[g_i] = tf.convert_to_tensor(g)
+          grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
+          gpu_grad_norms[gpu].append(norm)
+          for g in grads:
+            if grad_noise_scale > 0.001:
+              g += tf.truncated_normal(tf.shape(g)) * self.noise_param
+          grads_list.append(grads)
+        else:
+          gpu_grad_norms[gpu].append(0.0)
+        data.print_out("Created model for gpu %d in %.2f s."
+                       % (gpu, time.time() - start_time))
+
+    self.updates = []
+    self.after_enc_step = tf.concat(self.after_enc_step, 0)  # Concat GPUs.
+    if backward:
+      tf.get_variable_scope()._reuse = False
+      tf.get_variable_scope().set_caching_device(None)
+      grads = [gpu_avg([grads_list[g][i] for g in xrange(num_gpus)])
+               for i in xrange(len(grads_list[0]))]
+      update = adam_update(grads)
+      self.updates.append(update)
+    else:
+      self.updates.append(tf.no_op())
+
+    self.losses = [gpu_avg([gpu_losses[g][i] for g in xrange(num_gpus)])
+                   for i in xrange(len(gpu_losses[0]))]
+    self.out_idx = tf.concat(gpu_out_idx, 0)
+    self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in xrange(num_gpus)])
+                       for i in xrange(len(gpu_grad_norms[0]))]
+    self.outputs = [tf.concat([gpu_outputs[g] for g in xrange(num_gpus)], 1)]
+    self.quantize_op = quantize_weights_op(512, 8)
+    if backward:
+      self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

-      # ... removed per-bin graph construction: per-position embedding
-      # lookups with masks from check_for_zero, shifted_mask / scales,
-      # RX%d variable scopes running the old 7-argument conv_gru, a final
-      # conv_linear "output" layer with per-position softmax outputs,
-      # make_dense(noclass) targets, xent normalized by batch and length,
-      # relaxed_distance(rx_step) * self.pull added to the loss, and
-      # per-bin Adam updates with gradient noise ...
-      data_utils.print_out("Created model for bin of length %d in"
-                           " %.2f s." % (length, time.time() - start_time))
-    self.saver = tf.train.Saver(tf.all_variables())

-  def step(self, sess, inp, target, do_backward, noise_param=None,
-           get_steps=False):
+  def step(self, sess, inp, target, do_backward_in, noise_param=None,
+           beam_size=2, eos_id=2, eos_cost=0.0, update_mem=None, state=None):
     """Run a step of the network."""
-    assert len(inp) == len(target)
-    length = len(target)
+    batch_size, height, length = inp.shape[0], inp.shape[1], inp.shape[2]
+    do_backward = do_backward_in
+    train_mode = True
+    if do_backward_in is None:
+      do_backward = False
+      train_mode = False
+    if update_mem is None:
+      update_mem = do_backward
     feed_in = {}
+    # print "  feeding sequences of length %d" % length
+    if state is None:
+      state = np.zeros([batch_size, length, height, self.nmaps])
+    feed_in[self.prev_step.name] = state
+    feed_in[self.length_tensor.name] = length
     feed_in[self.noise_param.name] = noise_param if noise_param else 0.0
     feed_in[self.do_training.name] = 1.0 if do_backward else 0.0
+    feed_in[self.update_mem.name] = 1 if update_mem else 0
+    if do_backward_in is False:
+      feed_in[self.sampling.name] = 0.0
+    index = 0  # We're dynamic now.
     feed_out = []
-    index = len(data_utils.bins)
-    if length < data_utils.bins[-1] + 1:
-      index = data_utils.bins.index(length)
     if do_backward:
       feed_out.append(self.updates[index])
       feed_out.append(self.grad_norms[index])
-    feed_out.append(self.losses[index])
-    for l in xrange(length):
-      feed_in[self.input[l].name] = inp[l]
-    for l in xrange(length):
-      feed_in[self.target[l].name] = target[l]
-      feed_out.append(self.outputs[index][l])
-    if get_steps:
-      for l in xrange(length+1):
-        feed_out.append(self.steps[index][l])
-    res = sess.run(feed_out, feed_in)
+    if train_mode:
+      feed_out.append(self.losses[index])
+    feed_in[self.input.name] = inp
+    feed_in[self.target.name] = target
+    feed_out.append(self.outputs[index])
+    if train_mode:
+      # Make a full-sequence training step with one call to session.run.
+      res = sess.run([self.after_enc_step] + feed_out, feed_in)
+      after_enc_state, res = res[0], res[1:]
+    else:
+      # Make a full-sequence decoding step with one call to session.run.
+      feed_in[self.sampling.name] = 1.1  # Sample every time.
+      res = sess.run([self.after_enc_step, self.out_idx] + feed_out, feed_in)
+      after_enc_state, out_idx = res[0], res[1]
+      res = [res[2][l] for l in xrange(length)]
+      outputs = [out_idx[:, i] for i in xrange(length)]
+      cost = [0.0 for _ in xrange(beam_size * batch_size)]
+      seen_eos = [0 for _ in xrange(beam_size * batch_size)]
+      for idx, logit in enumerate(res):
+        best = outputs[idx]
+        for b in xrange(batch_size):
+          if seen_eos[b] > 1:
+            cost[b] -= eos_cost
+          else:
+            cost[b] += np.log(logit[b][best[b]])
+          if best[b] in [eos_id]:
+            seen_eos[b] += 1
+      res = [[-c for c in cost]] + outputs
     # Collect and output results.
     offset = 0
     norm = None
     if do_backward:
       offset = 2
       norm = res[1]
-    outputs = res[offset + 1:offset + 1 + length]
-    steps = res[offset + 1 + length:] if get_steps else None
-    return res[offset], outputs, norm, steps
+    if train_mode:
+      outputs = res[offset + 1]
+      outputs = [outputs[l] for l in xrange(length)]
+    return res[offset], outputs, norm, after_enc_state
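One detail worth pulling out of `__init__` above: the scheduled-sampling rate grows as training progresses, because the decay term it is divided by shrinks toward its 0.05 floor. An illustrative plain-Python evaluation (not from the commit) of that schedule, using the `max_sampling_rate` default of 0.1 for `sampling_rate`:

```python
# sampling = sampling_rate * 0.05 / max(0.05, 0.5 ** (global_step / 100000.0))
sampling_rate = 0.1  # e.g. the max_sampling_rate flag
for global_step in (0, 100000, 300000, 500000):
    decay = max(0.05, 0.5 ** (global_step / 100000.0))
    print(global_step, round(sampling_rate * 0.05 / decay, 4))
# 0 -> 0.005, 100000 -> 0.01, 300000 -> 0.04,
# 500000 -> 0.1 (the 0.05 floor caps it at sampling_rate)
```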
neural_gpu/neural_gpu_trainer.py
View file @
a315e568
...
...
@@ -12,260 +12,744 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Neural GPU
for Learning Algorithms
."""
"""Neural GPU."""
import
math
import
os
import
random
import
sys
import
threading
import
time
import
matplotlib.animation
as
anim
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.platform
import
gfile
import
program_utils
import
data_utils
as
data
import
neural_gpu
import
neural_gpu
as
ngpu
import
wmt_utils
as
wmt
tf.app.flags.DEFINE_float("lr", 0.001, "Learning rate.")
tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
tf.app.flags.DEFINE_float("max_grad_norm", 1.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("lr", 0.1, "Learning rate.")
tf.app.flags.DEFINE_float("init_weight", 0.8, "Initial weights deviation.")
tf.app.flags.DEFINE_float("max_grad_norm", 4.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
tf.app.flags.DEFINE_float("pull", 0.0005, "Starting pull of the relaxations.")
tf.app.flags.DEFINE_float("pull_incr", 1.2, "Increase pull by that much.")
tf.app.flags.DEFINE_float("curriculum_bound", 0.15, "Move curriculum < this.")
tf.app.flags.DEFINE_float("dropout", 0.15, "Dropout that much.")
tf.app.flags.DEFINE_float("curriculum_ppx", 9.9, "Move curriculum if ppl < X.")
tf.app.flags.DEFINE_float("curriculum_seq", 0.3, "Move curriculum if seq < X.")
tf.app.flags.DEFINE_float("dropout", 0.0, "Dropout that much.")
tf.app.flags.DEFINE_float("grad_noise_scale", 0.0, "Gradient noise scale.")
tf.app.flags.DEFINE_float("max_sampling_rate", 0.1, "Maximal sampling rate.")
tf.app.flags.DEFINE_float("length_norm", 0.0, "Length normalization.")
tf.app.flags.DEFINE_float("train_beam_freq", 0.0, "Beam-based training.")
tf.app.flags.DEFINE_float("train_beam_anneal", 20000, "How many steps anneal.")
tf.app.flags.DEFINE_integer("eval_beam_steps", 4, "How many beam steps eval.")
tf.app.flags.DEFINE_integer("batch_size", 32, "Batch size.")
tf.app.flags.DEFINE_integer("low_batch_size", 16, "Low batch size.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, "Steps per epoch.")
tf.app.flags.DEFINE_integer("nmaps", 128, "Number of floats in each cell.")
tf.app.flags.DEFINE_integer("niclass", 33, "Number of classes (0 is padding).")
tf.app.flags.DEFINE_integer("noclass", 33, "Number of classes (0 is padding).")
tf.app.flags.DEFINE_integer("train_data_size", 5000, "Training examples/len.")
tf.app.flags.DEFINE_integer("max_length", 41, "Maximum length.")
tf.app.flags.DEFINE_integer("rx_step", 6, "Relax that many recursive steps.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 100, "Steps per epoch.")
tf.app.flags.DEFINE_integer("nmaps", 64, "Number of floats in each cell.")
tf.app.flags.DEFINE_integer("vec_size", 64, "Size of word vectors.")
tf.app.flags.DEFINE_integer("train_data_size", 1000, "Training examples/len.")
tf.app.flags.DEFINE_integer("max_length", 40, "Maximum length.")
tf.app.flags.DEFINE_integer("random_seed", 125459, "Random seed.")
tf.app.flags.DEFINE_integer("nconvs", 2, "How many convolutions / 1 step.")
tf.app.flags.DEFINE_integer("kw", 3, "Kernel width.")
tf.app.flags.DEFINE_integer("kh", 3, "Kernel height.")
tf.app.flags.DEFINE_integer("height", 4, "Height.")
tf.app.flags.DEFINE_integer("forward_max", 401, "Maximum forward length.")
tf.app.flags.DEFINE_integer("jobid", -1, "Task id when running on borg.")
tf.app.flags.DEFINE_integer("mem_size", -1, "Memory size (sqrt)")
tf.app.flags.DEFINE_integer("soft_mem_size", 1024, "Softmax memory this size.")
tf.app.flags.DEFINE_integer("num_gpus", 1, "Number of GPUs to use.")
tf.app.flags.DEFINE_integer("num_replicas", 1, "Number of replicas in use.")
tf.app.flags.DEFINE_integer("beam_size", 1, "Beam size during decoding. "
                            "If 0, no decoder, the non-extended Neural GPU.")
tf.app.flags.DEFINE_integer("max_target_vocab", 0,
                            "Maximal size of target vocabulary.")
tf.app.flags.DEFINE_integer("decode_offset", 0, "Offset for decoding.")
tf.app.flags.DEFINE_integer("task", -1, "Task id when running on borg.")
tf.app.flags.DEFINE_integer("nprint", 0, "How many test examples to print out.")
tf.app.flags.DEFINE_integer("eval_bin_print", 3, "How many bins step in eval.")
tf.app.flags.DEFINE_integer("mode", 0, "Mode: 0-train other-decode.")
tf.app.flags.DEFINE_bool("animate", False, "Whether to produce an animation.")
tf.app.flags.DEFINE_bool("atrous", False, "Whether to use atrous convs.")
tf.app.flags.DEFINE_bool("layer_norm", False, "Do layer normalization.")
tf.app.flags.DEFINE_bool("quantize", False, "Whether to quantize variables.")
tf.app.flags.DEFINE_string("task", "rev", "Which task are we learning?")
tf.app.flags.DEFINE_bool("do_train", True, "If false, only update memory.")
tf.app.flags.DEFINE_bool("rnn_baseline", False, "If true build an RNN instead.")
tf.app.flags.DEFINE_bool("simple_tokenizer", False,
                         "If true, tokenize on spaces only, digits are 0.")
tf.app.flags.DEFINE_bool("normalize_digits", True,
                         "Whether to normalize digits with simple tokenizer.")
tf.app.flags.DEFINE_integer("vocab_size", 16, "Joint vocabulary size.")
tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
tf.app.flags.DEFINE_string("train_dir", "/tmp/", "Directory to store models.")
tf.app.flags.DEFINE_string("ensemble", "", "Model paths for ensemble.")
tf.app.flags.DEFINE_string("test_file_prefix", "", "Files to test (.en,.fr).")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
                            "Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_string("word_vector_file_en", "",
                           "Optional file with word vectors to start training.")
tf.app.flags.DEFINE_string("word_vector_file_fr", "",
                           "Optional file with word vectors to start training.")
tf.app.flags.DEFINE_string("problem", "wmt", "What problem are we solving?.")
tf.app.flags.DEFINE_integer("ps_tasks", 0, "Number of ps tasks used.")
tf.app.flags.DEFINE_string("master", "", "Name of the TensorFlow master.")

FLAGS = tf.app.flags.FLAGS

EXTRA_EVAL = 12
EXTRA_EVAL = 10
EVAL_LEN_INCR = 8
MAXLEN_F = 2.0
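# A small grid search over lr, init_weight and max_grad_norm is done further
# below (in initialize) by perturbing them with the task id via job_id_factor.
# The helper below just mirrors that formula to show the cycling behaviour
# (illustrative only; job_id_factor_demo is not used by the trainer):
def job_id_factor_demo(task, step):
  """If task / step mod 3 is 0, 1, 2: return 0, 1, -1."""
  return ((((task / step) % 3) + 1) % 3) - 1

# For step=1 and task ids 0..5 this gives 0, 1, -1, 0, 1, -1, so each
# hyperparameter is multiplied by pow(base, 0), pow(base, 1) or pow(base, -1).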
def zero_split(tok_list, append=None):
  """Split tok_list (list of ints) on 0s, append int to all parts if given."""
  res, cur, l = [], [], 0
  for tok in tok_list:
    if tok == 0:
      if append is not None:
        cur.append(append)
      res.append(cur)
      l = max(l, len(cur))
      cur = []
    else:
      cur.append(tok)
  if append is not None:
    cur.append(append)
  res.append(cur)
  l = max(l, len(cur))
  return res, l
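# A quick worked example for zero_split (illustrative values):
#   parts, max_len = zero_split([3, 4, 0, 5, 6, 0, 7], append=1)
#   parts   == [[3, 4, 1], [5, 6, 1], [7, 1]]
#   max_len == 3
# The id list is cut at every 0, the appended symbol (e.g. an EOS id) is added
# to every part, and the length of the longest part is returned alongside.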
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print "  reading data line %d" % counter
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
global_train_set = {"wmt": []}
train_buckets_scale = {"wmt": []}


def calculate_buckets_scale(data_set, buckets, problem):
  """Calculate buckets scales for the given data set."""
  train_bucket_sizes = [len(data_set[b]) for b in xrange(len(buckets))]
  train_total_size = max(1, float(sum(train_bucket_sizes)))
  # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
  # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
  # the size of the i-th training bucket, as used later.
  if problem not in train_buckets_scale:
    train_buckets_scale[problem] = []
  train_buckets_scale[problem].append(
      [sum(train_bucket_sizes[:i + 1]) / train_total_size
       for i in xrange(len(train_bucket_sizes))])
  return train_total_size
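# The scale list built above is a cumulative distribution over buckets, so a
# bucket can be drawn by inverting it with a uniform sample (this is what
# get_bucket_id does below). Sketch with hypothetical bucket sizes:
#   sizes = [10, 30, 60]
#   scale = [sum(sizes[:i + 1]) / float(sum(sizes)) for i in xrange(len(sizes))]
#   # scale == [0.1, 0.4, 1.0]
#   r = np.random.random_sample()
#   bucket_id = min([i for i in xrange(len(scale)) if scale[i] > r])
# Bucket i is then picked with probability proportional to its size.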
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print "  Finished global data reading (%d)." % train_total_size
def
initialize
(
sess
):
def
initialize
(
sess
=
None
):
"""Initialize data and model."""
if
FLAGS
.
jobid
>=
0
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"log%d"
%
FLAGS
.
jobid
)
data
.
print_out
(
"NN "
,
newline
=
False
)
global
MAXLEN_F
# Create training directory if it does not exist.
if
not
tf
.
gfile
.
IsDirectory
(
FLAGS
.
train_dir
):
data
.
print_out
(
"Creating training directory %s."
%
FLAGS
.
train_dir
)
tf
.
gfile
.
MkDir
(
FLAGS
.
train_dir
)
decode_suffix
=
"beam%dln%d"
%
(
FLAGS
.
beam_size
,
int
(
100
*
FLAGS
.
length_norm
))
if
FLAGS
.
mode
==
0
:
decode_suffix
=
""
if
FLAGS
.
task
>=
0
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"log%d%s"
%
(
FLAGS
.
task
,
decode_suffix
))
else
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu/log"
)
# Set random seed.
seed
=
FLAGS
.
random_seed
+
max
(
0
,
FLAGS
.
jobid
)
tf
.
set_random_seed
(
seed
)
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
if
FLAGS
.
random_seed
>
0
:
seed
=
FLAGS
.
random_seed
+
max
(
0
,
FLAGS
.
task
)
tf
.
set_random_seed
(
seed
)
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
# Check data sizes.
assert
data
.
bins
min_length
=
3
max_length
=
min
(
FLAGS
.
max_length
,
data
.
bins
[
-
1
])
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>=
max_length
+
EXTRA_EVAL
:
data
.
bins
=
data
.
bins
[:
-
1
]
if
sess
is
None
and
FLAGS
.
task
==
0
and
FLAGS
.
num_replicas
>
1
:
if
max_length
>
60
:
max_length
=
max_length
*
1
/
2
# Save memory on chief.
min_length
=
min
(
14
,
max_length
-
3
)
if
FLAGS
.
problem
==
"wmt"
else
3
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
if
p
in
[
"progeval"
,
"progsynth"
]:
min_length
=
max
(
26
,
min_length
)
assert
max_length
+
1
>
min_length
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>
max_length
+
EXTRA_EVAL
:
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>
=
max_length
+
EXTRA_EVAL
:
data
.
bins
=
data
.
bins
[:
-
1
]
assert
data
.
bins
[
0
]
>
FLAGS
.
rx_step
data
.
forward_max
=
max
(
FLAGS
.
forward_max
,
data
.
bins
[
-
1
])
nclass
=
min
(
FLAGS
.
niclass
,
FLAGS
.
noclass
)
data_size
=
FLAGS
.
train_data_size
if
FLAGS
.
mode
==
0
else
1000
# Initialize data for each task.
tasks
=
FLAGS
.
task
.
split
(
"-"
)
for
t
in
tasks
:
for
l
in
xrange
(
max_length
+
EXTRA_EVAL
-
1
):
data
.
init_data
(
t
,
l
,
data_size
,
nclass
)
data
.
init_data
(
t
,
data
.
bins
[
-
2
],
data_size
,
nclass
)
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
data_size
,
nclass
)
end_size
=
4
*
1024
if
FLAGS
.
mode
>
0
else
1024
data
.
init_data
(
t
,
data
.
forward_max
,
end_size
,
nclass
)
# Print out parameters.
curriculum
=
FLAGS
.
curriculum_bound
msg1
=
(
"layers %d kw %d h %d kh %d relax %d batch %d noise %.2f task %s"
%
(
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
height
,
FLAGS
.
kh
,
FLAGS
.
rx_step
,
FLAGS
.
batch_size
,
FLAGS
.
grad_noise_scale
,
FLAGS
.
task
))
msg2
=
"data %d %s"
%
(
FLAGS
.
train_data_size
,
msg1
)
msg3
=
(
"cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
%
(
FLAGS
.
cutoff
,
FLAGS
.
pull_incr
,
FLAGS
.
lr
,
FLAGS
.
init_weight
,
curriculum
,
FLAGS
.
nmaps
,
FLAGS
.
dropout
,
FLAGS
.
max_grad_norm
,
msg2
))
data
.
print_out
(
msg3
)
# Create checkpoint directory if it does not exist.
checkpoint_dir
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu%s"
%
(
""
if
FLAGS
.
jobid
<
0
else
str
(
FLAGS
.
jobid
)))
if
not
gfile
.
IsDirectory
(
checkpoint_dir
):
if
FLAGS
.
mode
==
0
or
FLAGS
.
task
<
0
:
checkpoint_dir
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu%s"
%
(
""
if
FLAGS
.
task
<
0
else
str
(
FLAGS
.
task
)))
else
:
checkpoint_dir
=
FLAGS
.
train_dir
if
not
tf
.
gfile
.
IsDirectory
(
checkpoint_dir
):
data
.
print_out
(
"Creating checkpoint directory %s."
%
checkpoint_dir
)
gfile
.
MkDir
(
checkpoint_dir
)
tf
.
gfile
.
MkDir
(
checkpoint_dir
)
# Prepare data.
if
FLAGS
.
problem
==
"wmt"
:
# Prepare WMT data.
data
.
print_out
(
"Preparing WMT data in %s"
%
FLAGS
.
data_dir
)
if
FLAGS
.
simple_tokenizer
:
MAXLEN_F
=
3.5
(
en_train
,
fr_train
,
en_dev
,
fr_dev
,
en_path
,
fr_path
)
=
wmt
.
prepare_wmt_data
(
FLAGS
.
data_dir
,
FLAGS
.
vocab_size
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
else
:
(
en_train
,
fr_train
,
en_dev
,
fr_dev
,
en_path
,
fr_path
)
=
wmt
.
prepare_wmt_data
(
FLAGS
.
data_dir
,
FLAGS
.
vocab_size
)
# Read data into buckets and compute their sizes.
fr_vocab
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_path
)
data
.
vocab
=
fr_vocab
data
.
rev_vocab
=
rev_fr_vocab
data
.
print_out
(
"Reading development and training data (limit: %d)."
%
FLAGS
.
max_train_data_size
)
dev_set
=
read_data
(
en_dev
,
fr_dev
,
data
.
bins
)
def
data_read
(
size
,
print_out
):
read_data_into_global
(
en_train
,
fr_train
,
data
.
bins
,
size
,
print_out
)
data_read
(
50000
,
False
)
read_thread_small
=
threading
.
Thread
(
name
=
"reading-data-small"
,
target
=
lambda
:
data_read
(
900000
,
False
))
read_thread_small
.
start
()
read_thread_full
=
threading
.
Thread
(
name
=
"reading-data-full"
,
target
=
lambda
:
data_read
(
FLAGS
.
max_train_data_size
,
True
))
read_thread_full
.
start
()
data
.
print_out
(
"Data reading set up."
)
else
:
# Prepare algorithmic data.
en_path
,
fr_path
=
None
,
None
tasks
=
FLAGS
.
problem
.
split
(
"-"
)
data_size
=
FLAGS
.
train_data_size
for
t
in
tasks
:
data
.
print_out
(
"Generating data for %s."
%
t
)
if
t
in
[
"progeval"
,
"progsynth"
]:
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
20
*
data_size
,
FLAGS
.
vocab_size
)
if
len
(
program_utils
.
prog_vocab
)
>
FLAGS
.
vocab_size
-
2
:
raise
ValueError
(
"Increase vocab_size to %d for prog-tasks."
%
(
len
(
program_utils
.
prog_vocab
)
+
2
))
data
.
rev_vocab
=
program_utils
.
prog_vocab
data
.
vocab
=
program_utils
.
prog_rev_vocab
else
:
for
l
in
xrange
(
max_length
+
EXTRA_EVAL
-
1
):
data
.
init_data
(
t
,
l
,
data_size
,
FLAGS
.
vocab_size
)
data
.
init_data
(
t
,
data
.
bins
[
-
2
],
data_size
,
FLAGS
.
vocab_size
)
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
data_size
,
FLAGS
.
vocab_size
)
if
t
not
in
global_train_set
:
global_train_set
[
t
]
=
[]
global_train_set
[
t
].
append
(
data
.
train_set
[
t
])
calculate_buckets_scale
(
data
.
train_set
[
t
],
data
.
bins
,
t
)
dev_set
=
data
.
test_set
# Grid-search parameters.
lr
=
FLAGS
.
lr
init_weight
=
FLAGS
.
init_weight
max_grad_norm
=
FLAGS
.
max_grad_norm
if
sess
is
not
None
and
FLAGS
.
task
>
-
1
:
def
job_id_factor
(
step
):
"""If jobid / step mod 3 is 0, 1, 2: say 0, 1, -1."""
return
((((
FLAGS
.
task
/
step
)
%
3
)
+
1
)
%
3
)
-
1
lr
*=
math
.
pow
(
2
,
job_id_factor
(
1
))
init_weight
*=
math
.
pow
(
1.5
,
job_id_factor
(
3
))
max_grad_norm
*=
math
.
pow
(
2
,
job_id_factor
(
9
))
# Print out parameters.
curriculum
=
FLAGS
.
curriculum_seq
msg1
=
(
"layers %d kw %d h %d kh %d batch %d noise %.2f"
%
(
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
height
,
FLAGS
.
kh
,
FLAGS
.
batch_size
,
FLAGS
.
grad_noise_scale
))
msg2
=
(
"cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
%
(
FLAGS
.
cutoff
,
lr
,
init_weight
,
curriculum
,
FLAGS
.
nmaps
,
FLAGS
.
dropout
,
max_grad_norm
,
msg1
))
data
.
print_out
(
msg2
)
# Create model and initialize it.
tf
.
get_variable_scope
().
set_initializer
(
tf
.
uniform_unit_scaling_initializer
(
factor
=
1.8
*
FLAGS
.
init_weight
))
model
=
neural_gpu
.
NeuralGPU
(
FLAGS
.
nmaps
,
FLAGS
.
nmaps
,
FLAGS
.
niclass
,
FLAGS
.
noclass
,
FLAGS
.
dropout
,
FLAGS
.
rx_step
,
FLAGS
.
max_grad_norm
,
FLAGS
.
cutoff
,
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
kh
,
FLAGS
.
height
,
FLAGS
.
mode
,
FLAGS
.
lr
,
FLAGS
.
pull
,
FLAGS
.
pull_incr
,
min_length
+
3
)
data
.
print_out
(
"Created model."
)
sess
.
run
(
tf
.
initialize_all_variables
())
data
.
print_out
(
"Initialized variables."
)
tf
.
orthogonal_initializer
(
gain
=
1.8
*
init_weight
))
max_sampling_rate
=
FLAGS
.
max_sampling_rate
if
FLAGS
.
mode
==
0
else
0.0
o
=
FLAGS
.
vocab_size
if
FLAGS
.
max_target_vocab
<
1
else
FLAGS
.
max_target_vocab
ngpu
.
CHOOSE_K
=
FLAGS
.
soft_mem_size
do_beam_model
=
FLAGS
.
train_beam_freq
>
0.0001
and
FLAGS
.
beam_size
>
1
beam_size
=
FLAGS
.
beam_size
if
FLAGS
.
mode
>
0
and
not
do_beam_model
else
1
beam_model
=
None
def
make_ngpu
(
cur_beam_size
,
back
):
return
ngpu
.
NeuralGPU
(
FLAGS
.
nmaps
,
FLAGS
.
vec_size
,
FLAGS
.
vocab_size
,
o
,
FLAGS
.
dropout
,
max_grad_norm
,
FLAGS
.
cutoff
,
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
kh
,
FLAGS
.
height
,
FLAGS
.
mem_size
,
lr
/
math
.
sqrt
(
FLAGS
.
num_replicas
),
min_length
+
3
,
FLAGS
.
num_gpus
,
FLAGS
.
num_replicas
,
FLAGS
.
grad_noise_scale
,
max_sampling_rate
,
atrous
=
FLAGS
.
atrous
,
do_rnn
=
FLAGS
.
rnn_baseline
,
do_layer_norm
=
FLAGS
.
layer_norm
,
beam_size
=
cur_beam_size
,
backward
=
back
)
if
sess
is
None
:
with
tf
.
device
(
tf
.
train
.
replica_device_setter
(
FLAGS
.
ps_tasks
)):
model
=
make_ngpu
(
beam_size
,
True
)
if
do_beam_model
:
tf
.
get_variable_scope
().
reuse_variables
()
beam_model
=
make_ngpu
(
FLAGS
.
beam_size
,
False
)
else
:
model
=
make_ngpu
(
beam_size
,
True
)
if
do_beam_model
:
tf
.
get_variable_scope
().
reuse_variables
()
beam_model
=
make_ngpu
(
FLAGS
.
beam_size
,
False
)
sv
=
None
if
sess
is
None
:
# The supervisor configuration has a few overriden options.
sv
=
tf
.
train
.
Supervisor
(
logdir
=
checkpoint_dir
,
is_chief
=
(
FLAGS
.
task
<
1
),
saver
=
model
.
saver
,
summary_op
=
None
,
save_summaries_secs
=
60
,
save_model_secs
=
15
*
60
,
global_step
=
model
.
global_step
)
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
)
sess
=
sv
.
PrepareSession
(
FLAGS
.
master
,
config
=
config
)
data
.
print_out
(
"Created model. Checkpoint dir %s"
%
checkpoint_dir
)
# Load model from parameters if a checkpoint exists.
ckpt
=
tf
.
train
.
get_checkpoint_state
(
checkpoint_dir
)
if
ckpt
and
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
):
if
ckpt
and
tf
.
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
+
".index"
):
data
.
print_out
(
"Reading model parameters from %s"
%
ckpt
.
model_checkpoint_path
)
model
.
saver
.
restore
(
sess
,
ckpt
.
model_checkpoint_path
)
# Check if there are ensemble models and get their checkpoints.
ensemble
=
[]
ensemble_dir_list
=
[
d
for
d
in
FLAGS
.
ensemble
.
split
(
","
)
if
d
]
for
ensemble_dir
in
ensemble_dir_list
:
ckpt
=
tf
.
train
.
get_checkpoint_state
(
ensemble_dir
)
if
ckpt
and
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
):
data
.
print_out
(
"Found ensemble model %s"
%
ckpt
.
model_checkpoint_path
)
ensemble
.
append
(
ckpt
.
model_checkpoint_path
)
elif
sv
is
None
:
sess
.
run
(
tf
.
initialize_all_variables
())
data
.
print_out
(
"Initialized variables (no supervisor mode)."
)
elif
FLAGS
.
task
<
1
and
FLAGS
.
mem_size
>
0
:
# sess.run(model.mem_norm_op)
data
.
print_out
(
"Created new model and normalized mem (on chief)."
)
# Return the model and needed variables.
return
(
model
,
min_length
,
max_length
,
checkpoint_dir
,
curriculum
,
ensemble
)
def
single_test
(
l
,
model
,
sess
,
task
,
nprint
,
batch_size
,
print_out
=
True
,
offset
=
None
,
ensemble
=
None
,
get_steps
=
False
):
return
(
model
,
beam_model
,
min_length
,
max_length
,
checkpoint_dir
,
(
global_train_set
,
dev_set
,
en_path
,
fr_path
),
sv
,
sess
)
def m_step(model, beam_model, sess, batch_size, inp, target, bucket, nsteps, p):
  """Evaluation multi-step for program synthesis."""
  state, scores, hist = None, [[-11.0 for _ in xrange(batch_size)]], []
  for _ in xrange(nsteps):
    # Get the best beam (no training, just forward model).
    new_target, new_first, new_inp, new_scores = get_best_beam(
        beam_model, sess, inp, target, batch_size, FLAGS.beam_size,
        bucket, hist, p, test_mode=True)
    hist.append(new_first)
    _, _, _, state = model.step(sess, inp, new_target, False, state=state)
    inp = new_inp
    scores.append([max(scores[-1][i], new_scores[i])
                   for i in xrange(batch_size)])
  # The final step with the true target.
  loss, res, _, _ = model.step(sess, inp, target, False, state=state)
  return loss, res, new_target, scores[1:]
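# m_step keeps, for every example in the batch, the best beam score seen over
# the nsteps passes. The bookkeeping in the scores list, on toy numbers
# (illustrative):
#   scores = [[-11.0, -11.0]]                  # running best per example
#   for new_scores in [[2.0, -3.0], [1.0, 4.0]]:
#     scores.append([max(scores[-1][i], new_scores[i]) for i in xrange(2)])
#   # scores[1:] == [[2.0, -3.0], [2.0, 4.0]]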
def
single_test
(
bin_id
,
model
,
sess
,
nprint
,
batch_size
,
dev
,
p
,
print_out
=
True
,
offset
=
None
,
beam_model
=
None
):
"""Test model on test data of length l using the given session."""
inpt
,
target
=
data
.
get_batch
(
l
,
batch_size
,
False
,
task
,
offset
)
_
,
res
,
_
,
steps
=
model
.
step
(
sess
,
inpt
,
target
,
False
,
get_steps
=
get_steps
)
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
)
if
not
dev
[
p
][
bin_id
]:
data
.
print_out
(
" bin %d (%d)
\t
%s
\t
ppl NA errors NA seq-errors NA"
%
(
bin_id
,
data
.
bins
[
bin_id
],
p
))
return
1.0
,
1.0
,
0.0
inpt
,
target
=
data
.
get_batch
(
bin_id
,
batch_size
,
dev
[
p
],
FLAGS
.
height
,
offset
)
if
FLAGS
.
beam_size
>
1
and
beam_model
:
loss
,
res
,
new_tgt
,
scores
=
m_step
(
model
,
beam_model
,
sess
,
batch_size
,
inpt
,
target
,
bin_id
,
FLAGS
.
eval_beam_steps
,
p
)
score_avgs
=
[
sum
(
s
)
/
float
(
len
(
s
))
for
s
in
scores
]
score_maxs
=
[
max
(
s
)
for
s
in
scores
]
score_str
=
[
"(%.2f, %.2f)"
%
(
score_avgs
[
i
],
score_maxs
[
i
])
for
i
in
xrange
(
FLAGS
.
eval_beam_steps
)]
data
.
print_out
(
" == scores (avg, max): %s"
%
"; "
.
join
(
score_str
))
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
,
new_tgt
,
scores
[
-
1
])
else
:
loss
,
res
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
)
seq_err
=
float
(
seq_err
)
/
batch_size
if
total
>
0
:
errors
=
float
(
errors
)
/
total
if
print_out
:
data
.
print_out
(
" %s len %d errors %.2f sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
# Ensemble eval.
if
ensemble
:
results
=
[]
for
m
in
ensemble
:
model
.
saver
.
restore
(
sess
,
m
)
_
,
result
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
m_errors
,
m_total
,
m_seq_err
=
data
.
accuracy
(
inpt
,
result
,
target
,
batch_size
,
nprint
)
m_seq_err
=
float
(
m_seq_err
)
/
batch_size
if
total
>
0
:
m_errors
=
float
(
m_errors
)
/
m_total
data
.
print_out
(
" %s len %d m-errors %.2f m-sequence-errors %.2f"
%
(
task
,
l
,
100
*
m_errors
,
100
*
m_seq_err
))
results
.
append
(
result
)
ens
=
[
sum
(
o
)
for
o
in
zip
(
*
results
)]
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
ens
,
target
,
batch_size
,
nprint
)
seq_err
=
float
(
seq_err
)
/
batch_size
if
total
>
0
:
errors
=
float
(
errors
)
/
total
if
print_out
:
data
.
print_out
(
" %s len %d ens-errors %.2f ens-sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
return
errors
,
seq_err
,
(
steps
,
inpt
,
[
np
.
argmax
(
o
,
axis
=
1
)
for
o
in
res
])
def
multi_test
(
l
,
model
,
sess
,
task
,
nprint
,
batch_size
,
offset
=
None
,
ensemble
=
None
):
"""Run multiple tests at lower batch size to save memory."""
errors
,
seq_err
=
0.0
,
0.0
to_print
=
nprint
low_batch
=
FLAGS
.
low_batch_size
low_batch
=
min
(
low_batch
,
batch_size
)
for
mstep
in
xrange
(
batch_size
/
low_batch
):
cur_offset
=
None
if
offset
is
None
else
offset
+
mstep
*
low_batch
err
,
sq_err
,
_
=
single_test
(
l
,
model
,
sess
,
task
,
to_print
,
low_batch
,
False
,
cur_offset
,
ensemble
=
ensemble
)
to_print
=
max
(
0
,
to_print
-
low_batch
)
errors
+=
err
seq_err
+=
sq_err
if
FLAGS
.
mode
>
0
:
cur_errors
=
float
(
low_batch
*
errors
)
/
((
mstep
+
1
)
*
low_batch
)
cur_seq_err
=
float
(
low_batch
*
seq_err
)
/
((
mstep
+
1
)
*
low_batch
)
data
.
print_out
(
" %s multitest current errors %.2f sequence-errors %.2f"
%
(
task
,
100
*
cur_errors
,
100
*
cur_seq_err
))
errors
=
float
(
low_batch
)
*
float
(
errors
)
/
batch_size
seq_err
=
float
(
low_batch
)
*
float
(
seq_err
)
/
batch_size
data
.
print_out
(
" %s len %d errors %.2f sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
return
errors
,
seq_err
data
.
print_out
(
" bin %d (%d)
\t
%s
\t
ppl %.2f errors %.2f seq-errors %.2f"
%
(
bin_id
,
data
.
bins
[
bin_id
],
p
,
data
.
safe_exp
(
loss
),
100
*
errors
,
100
*
seq_err
))
return
(
errors
,
seq_err
,
loss
)
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size, len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
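# Word-vector files are plain text: a word followed by its float components,
# e.g. "dog 0.045123 -0.61323 0.413667 ...". Parsing a single line as above
# (illustrative values):
#   line_parts = "dog 0.5 -0.25 1.0".split()
#   word = line_parts[0]                                # "dog"
#   word_vector = np.array(map(float, line_parts[1:]))  # array([ 0.5, -0.25, 1. ])
# The vector is used only when the word is in the vocabulary and its length
# matches FLAGS.vec_size.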
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
def get_bucket_id(train_buckets_scale_c, max_cur_length, data_set):
  """Get a random bucket id."""
  # Choose a bucket according to data distribution. Pick a random number
  # in [0, 1] and use the corresponding interval in train_buckets_scale.
  random_number_01 = np.random.random_sample()
  bucket_id = min([i for i in xrange(len(train_buckets_scale_c))
                   if train_buckets_scale_c[i] > random_number_01])
  while bucket_id > 0 and not data_set[bucket_id]:
    bucket_id -= 1
  for _ in xrange(10 if np.random.random_sample() < 0.9 else 1):
    if data.bins[bucket_id] > max_cur_length:
      random_number_01 = min(random_number_01, np.random.random_sample())
      bucket_id = min([i for i in xrange(len(train_buckets_scale_c))
                       if train_buckets_scale_c[i] > random_number_01])
      while bucket_id > 0 and not data_set[bucket_id]:
        bucket_id -= 1
  return bucket_id
def score_beams(beams, target, inp, history, p,
                print_out=False, test_mode=False):
  """Score beams."""
  if p == "progsynth":
    return score_beams_prog(beams, target, inp, history, print_out, test_mode)
  elif test_mode:
    return beams[0], 10.0 if str(beams[0][:len(target)]) == str(target) else 0.0
  else:
    history_s = [str(h) for h in history]
    best, best_score, tgt, eos_id = None, -1000.0, target, None
    if p == "wmt":
      eos_id = wmt.EOS_ID
    if eos_id and eos_id in target:
      tgt = target[:target.index(eos_id)]
    for beam in beams:
      if eos_id and eos_id in beam:
        beam = beam[:beam.index(eos_id)]
      l = min(len(tgt), len(beam))
      score = len([i for i in xrange(l) if tgt[i] == beam[i]]) / float(len(tgt))
      hist_score = 20.0 if str([b for b in beam if b > 0]) in history_s else 0.0
      if score < 1.0:
        score -= hist_score
      if score > best_score:
        best = beam
        best_score = score
    return best, best_score
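# Worked example of the non-progsynth scoring above (illustrative ids, with
# eos_id marking the end of both sequences):
#   target = [5, 6, 7, eos_id],  beam = [5, 6, 9, eos_id]
#   after cutting at EOS: tgt = [5, 6, 7], beam = [5, 6, 9]
#   2 of the 3 positions match, so score = 2 / 3.0 ~= 0.67; a beam that was
#   already proposed in an earlier step would additionally lose 20.0 while
#   score < 1.0, which strongly discourages repeats.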
def
score_beams_prog
(
beams
,
target
,
inp
,
history
,
print_out
=
False
,
test_mode
=
False
):
"""Score beams for program synthesis."""
tgt_prog
=
linearize
(
target
,
program_utils
.
prog_vocab
,
True
,
1
)
hist_progs
=
[
linearize
(
h
,
program_utils
.
prog_vocab
,
True
,
1
)
for
h
in
history
]
tgt_set
=
set
(
target
)
if
print_out
:
print
"target: "
,
tgt_prog
inps
,
tgt_outs
=
[],
[]
for
i
in
xrange
(
3
):
ilist
=
[
inp
[
i
+
1
,
l
]
for
l
in
xrange
(
inp
.
shape
[
1
])]
clist
=
[
program_utils
.
prog_vocab
[
x
]
for
x
in
ilist
if
x
>
0
]
olist
=
clist
[
clist
.
index
(
"]"
)
+
1
:]
# outputs
clist
=
clist
[
1
:
clist
.
index
(
"]"
)]
# inputs
inps
.
append
([
int
(
x
)
for
x
in
clist
])
if
olist
[
0
]
==
"["
:
# olist may be [int] or just int
tgt_outs
.
append
(
str
([
int
(
x
)
for
x
in
olist
[
1
:
-
1
]]))
else
:
if
len
(
olist
)
==
1
:
tgt_outs
.
append
(
olist
[
0
])
else
:
print
[
program_utils
.
prog_vocab
[
x
]
for
x
in
ilist
if
x
>
0
]
print
olist
print
tgt_prog
print
program_utils
.
evaluate
(
tgt_prog
,
{
"a"
:
inps
[
-
1
]})
print
"AAAAA"
tgt_outs
.
append
(
olist
[
0
])
if
not
test_mode
:
for
_
in
xrange
(
7
):
ilen
=
np
.
random
.
randint
(
len
(
target
)
-
3
)
+
1
inps
.
append
([
random
.
choice
(
range
(
-
15
,
15
))
for
_
in
range
(
ilen
)])
tgt_outs
.
extend
([
program_utils
.
evaluate
(
tgt_prog
,
{
"a"
:
inp
})
for
inp
in
inps
[
3
:]])
best
,
best_prog
,
best_score
=
None
,
""
,
-
1000.0
for
beam
in
beams
:
b_prog
=
linearize
(
beam
,
program_utils
.
prog_vocab
,
True
,
1
)
b_set
=
set
(
beam
)
jsim
=
len
(
tgt_set
&
b_set
)
/
float
(
len
(
tgt_set
|
b_set
))
b_outs
=
[
program_utils
.
evaluate
(
b_prog
,
{
"a"
:
inp
})
for
inp
in
inps
]
errs
=
len
([
x
for
x
in
b_outs
if
x
==
"ERROR"
])
imatches
=
len
([
i
for
i
in
xrange
(
3
)
if
b_outs
[
i
]
==
tgt_outs
[
i
]])
perfect
=
10.0
if
imatches
==
3
else
0.0
hist_score
=
20.0
if
b_prog
in
hist_progs
else
0.0
if
test_mode
:
score
=
perfect
-
errs
else
:
matches
=
len
([
i
for
i
in
xrange
(
10
)
if
b_outs
[
i
]
==
tgt_outs
[
i
]])
score
=
perfect
+
matches
+
jsim
-
errs
if
score
<
10.0
:
score
-=
hist_score
# print b_prog
# print "jsim: ", jsim, " errs: ", errs, " mtchs: ", matches, " s: ", score
if
score
>
best_score
:
best
=
beam
best_prog
=
b_prog
best_score
=
score
if
print_out
:
print
"best score: "
,
best_score
,
" best prog: "
,
best_prog
return
best
,
best_score
def
get_best_beam
(
beam_model
,
sess
,
inp
,
target
,
batch_size
,
beam_size
,
bucket
,
history
,
p
,
test_mode
=
False
):
"""Run beam_model, score beams, and return the best as target and in input."""
_
,
output_logits
,
_
,
_
=
beam_model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
)
new_targets
,
new_firsts
,
scores
,
new_inp
=
[],
[],
[],
np
.
copy
(
inp
)
for
b
in
xrange
(
batch_size
):
outputs
=
[]
history_b
=
[[
h
[
b
,
0
,
l
]
for
l
in
xrange
(
data
.
bins
[
bucket
])]
for
h
in
history
]
for
beam_idx
in
xrange
(
beam_size
):
outputs
.
append
([
int
(
o
[
beam_idx
*
batch_size
+
b
])
for
o
in
output_logits
])
target_t
=
[
target
[
b
,
0
,
l
]
for
l
in
xrange
(
data
.
bins
[
bucket
])]
best
,
best_score
=
score_beams
(
outputs
,
[
t
for
t
in
target_t
if
t
>
0
],
inp
[
b
,
:,
:],
[[
t
for
t
in
h
if
t
>
0
]
for
h
in
history_b
],
p
,
test_mode
=
test_mode
)
scores
.
append
(
best_score
)
if
1
in
best
:
# Only until _EOS.
best
=
best
[:
best
.
index
(
1
)
+
1
]
best
+=
[
0
for
_
in
xrange
(
len
(
target_t
)
-
len
(
best
))]
new_targets
.
append
([
best
])
first
,
_
=
score_beams
(
outputs
,
[
t
for
t
in
target_t
if
t
>
0
],
inp
[
b
,
:,
:],
[[
t
for
t
in
h
if
t
>
0
]
for
h
in
history_b
],
p
,
test_mode
=
True
)
if
1
in
first
:
# Only until _EOS.
first
=
first
[:
first
.
index
(
1
)
+
1
]
first
+=
[
0
for
_
in
xrange
(
len
(
target_t
)
-
len
(
first
))]
new_inp
[
b
,
0
,
:]
=
np
.
array
(
first
,
dtype
=
np
.
int32
)
new_firsts
.
append
([
first
])
# Change target if we found a great answer.
new_target
=
np
.
array
(
new_targets
,
dtype
=
np
.
int32
)
for
b
in
xrange
(
batch_size
):
if
scores
[
b
]
>=
10.0
:
target
[
b
,
0
,
:]
=
new_target
[
b
,
0
,
:]
new_first
=
np
.
array
(
new_firsts
,
dtype
=
np
.
int32
)
return
new_target
,
new_first
,
new_inp
,
scores
def
train
():
"""Train the model."""
batch_size
=
FLAGS
.
batch_size
tasks
=
FLAGS
.
task
.
split
(
"-"
)
with
tf
.
Session
()
as
sess
:
(
model
,
min_length
,
max_length
,
checkpoint_dir
,
curriculum
,
_
)
=
initialize
(
sess
)
quant_op
=
neural_gpu
.
quantize_weights_op
(
512
,
8
)
batch_size
=
FLAGS
.
batch_size
*
FLAGS
.
num_gpus
(
model
,
beam_model
,
min_length
,
max_length
,
checkpoint_dir
,
(
train_set
,
dev_set
,
en_vocab_path
,
fr_vocab_path
),
sv
,
sess
)
=
initialize
()
with
sess
.
as_default
():
quant_op
=
model
.
quantize_op
max_cur_length
=
min
(
min_length
+
3
,
max_length
)
prev_acc_perp
=
[
1000000
for
_
in
xrange
(
3
)]
prev_acc_perp
=
[
1000000
for
_
in
xrange
(
5
)]
prev_seq_err
=
1.0
is_chief
=
FLAGS
.
task
<
1
do_report
=
False
# Main traning loop.
while
True
:
global_step
,
pull
,
max_cur_length
,
learning_rate
=
sess
.
run
(
[
model
.
global_step
,
model
.
pull
,
model
.
cur_length
,
model
.
lr
])
acc_loss
,
acc_total
,
acc_errors
,
acc_seq_err
=
0.0
,
0
,
0
,
0
acc_grad_norm
,
step_count
,
step_time
=
0.0
,
0
,
0.0
while
not
sv
.
ShouldStop
():
global_step
,
max_cur_length
,
learning_rate
=
sess
.
run
(
[
model
.
global_step
,
model
.
cur_length
,
model
.
lr
])
acc_loss
,
acc_l1
,
acc_total
,
acc_errors
,
acc_seq_err
=
0.0
,
0.0
,
0
,
0
,
0
acc_grad_norm
,
step_count
,
step_c1
,
step_time
=
0.0
,
0
,
0
,
0.0
# For words in the word vector file, set their embedding at start.
bound1
=
FLAGS
.
steps_per_checkpoint
-
1
if
FLAGS
.
word_vector_file_en
and
global_step
<
bound1
and
is_chief
:
assign_vectors
(
FLAGS
.
word_vector_file_en
,
"embedding:0"
,
en_vocab_path
,
sess
)
if
FLAGS
.
max_target_vocab
<
1
:
assign_vectors
(
FLAGS
.
word_vector_file_en
,
"target_embedding:0"
,
en_vocab_path
,
sess
)
if
FLAGS
.
word_vector_file_fr
and
global_step
<
bound1
and
is_chief
:
assign_vectors
(
FLAGS
.
word_vector_file_fr
,
"embedding:0"
,
fr_vocab_path
,
sess
)
if
FLAGS
.
max_target_vocab
<
1
:
assign_vectors
(
FLAGS
.
word_vector_file_fr
,
"target_embedding:0"
,
fr_vocab_path
,
sess
)
for
_
in
xrange
(
FLAGS
.
steps_per_checkpoint
):
global_step
+=
1
task
=
random
.
choice
(
tasks
)
# Select the length for curriculum learning.
l
=
np
.
random
.
randint
(
max_cur_length
-
min_length
+
1
)
+
min_length
# Prefer longer stuff 60% of time.
if
np
.
random
.
randint
(
100
)
<
60
:
l1
=
np
.
random
.
randint
(
max_cur_length
-
min_length
+
1
)
+
min_length
l
=
max
(
l
,
l1
)
# Mixed curriculum learning: in 25% of cases go to any larger length.
if
np
.
random
.
randint
(
100
)
<
25
:
l1
=
np
.
random
.
randint
(
max_length
-
min_length
+
1
)
+
min_length
l
=
max
(
l
,
l1
)
step_count
+=
1
step_c1
+=
1
global_step
=
int
(
model
.
global_step
.
eval
())
train_beam_anneal
=
global_step
/
float
(
FLAGS
.
train_beam_anneal
)
train_beam_freq
=
FLAGS
.
train_beam_freq
*
min
(
1.0
,
train_beam_anneal
)
p
=
random
.
choice
(
FLAGS
.
problem
.
split
(
"-"
))
train_set
=
global_train_set
[
p
][
-
1
]
bucket_id
=
get_bucket_id
(
train_buckets_scale
[
p
][
-
1
],
max_cur_length
,
train_set
)
# Prefer longer stuff 60% of time if not wmt.
if
np
.
random
.
randint
(
100
)
<
60
and
FLAGS
.
problem
!=
"wmt"
:
bucket1
=
get_bucket_id
(
train_buckets_scale
[
p
][
-
1
],
max_cur_length
,
train_set
)
bucket_id
=
max
(
bucket1
,
bucket_id
)
# Run a step and time it.
start_time
=
time
.
time
()
inp
,
target
=
data
.
get_batch
(
l
,
batch_size
,
True
,
task
)
noise_param
=
math
.
sqrt
(
math
.
pow
(
global_step
,
-
0.55
)
*
inp
,
target
=
data
.
get_batch
(
bucket_id
,
batch_size
,
train_set
,
FLAGS
.
height
)
noise_param
=
math
.
sqrt
(
math
.
pow
(
global_step
+
1
,
-
0.55
)
*
prev_seq_err
)
*
FLAGS
.
grad_noise_scale
loss
,
res
,
gnorm
,
_
=
model
.
step
(
sess
,
inp
,
target
,
True
,
noise_param
)
# In multi-step mode, we use best from beam for middle steps.
state
,
new_target
,
scores
,
history
=
None
,
None
,
None
,
[]
while
(
FLAGS
.
beam_size
>
1
and
train_beam_freq
>
np
.
random
.
random_sample
()):
# Get the best beam (no training, just forward model).
new_target
,
new_first
,
new_inp
,
scores
=
get_best_beam
(
beam_model
,
sess
,
inp
,
target
,
batch_size
,
FLAGS
.
beam_size
,
bucket_id
,
history
,
p
)
history
.
append
(
new_first
)
# Training step with the previous input and the best beam as target.
_
,
_
,
_
,
state
=
model
.
step
(
sess
,
inp
,
new_target
,
FLAGS
.
do_train
,
noise_param
,
update_mem
=
True
,
state
=
state
)
# Change input to the new one for the next step.
inp
=
new_inp
# If all results are great, stop (todo: not to wait for all?).
if
FLAGS
.
nprint
>
1
:
print
scores
if
sum
(
scores
)
/
float
(
len
(
scores
))
>=
10.0
:
break
# The final step with the true target.
loss
,
res
,
gnorm
,
_
=
model
.
step
(
sess
,
inp
,
target
,
FLAGS
.
do_train
,
noise_param
,
update_mem
=
True
,
state
=
state
)
step_time
+=
time
.
time
()
-
start_time
acc_grad_norm
+=
float
(
gnorm
)
# Accumulate statistics only if we did not exceed curriculum length.
if
l
<
max_cur_length
+
1
:
step_count
+=
1
acc_loss
+=
loss
errors
,
total
,
seq_err
=
data
.
accuracy
(
inp
,
res
,
target
,
batch_size
,
0
)
acc_total
+=
total
acc_errors
+=
errors
acc_seq_err
+=
seq_err
acc_grad_norm
+=
0.0
if
gnorm
is
None
else
float
(
gnorm
)
# Accumulate statistics.
acc_loss
+=
loss
acc_l1
+=
loss
errors
,
total
,
seq_err
=
data
.
accuracy
(
inp
,
res
,
target
,
batch_size
,
0
,
new_target
,
scores
)
if
FLAGS
.
nprint
>
1
:
print
"seq_err: "
,
seq_err
acc_total
+=
total
acc_errors
+=
errors
acc_seq_err
+=
seq_err
# Report summary every 10 steps.
if
step_count
+
3
>
FLAGS
.
steps_per_checkpoint
:
do_report
=
True
# Don't polute plot too early.
if
is_chief
and
step_count
%
10
==
1
and
do_report
:
cur_loss
=
acc_l1
/
float
(
step_c1
)
acc_l1
,
step_c1
=
0.0
,
0
cur_perp
=
data
.
safe_exp
(
cur_loss
)
summary
=
tf
.
Summary
()
summary
.
value
.
extend
(
[
tf
.
Summary
.
Value
(
tag
=
"log_perplexity"
,
simple_value
=
cur_loss
),
tf
.
Summary
.
Value
(
tag
=
"perplexity"
,
simple_value
=
cur_perp
)])
sv
.
SummaryComputed
(
sess
,
summary
,
global_step
)
# Normalize and print out accumulated statistics.
acc_loss
/=
step_count
...
...
@@ -273,178 +757,257 @@ def train():
acc_seq_err
=
float
(
acc_seq_err
)
/
(
step_count
*
batch_size
)
prev_seq_err
=
max
(
0.0
,
acc_seq_err
-
0.02
)
# No noise at error < 2%.
acc_errors
=
float
(
acc_errors
)
/
acc_total
if
acc_total
>
0
else
1.0
msg1
=
"step %d step-time %.2f"
%
(
global_step
,
step_time
)
msg
2
=
"lr %.8f pull %.3f"
%
(
learning_rate
,
pull
)
msg3
=
(
"%s %s grad-norm %.8f"
%
(
msg1
,
msg2
,
acc_grad_norm
/
FLAGS
.
steps_per_checkpoint
))
data
.
print_out
(
"%s len %d pp
x
%.
8
f errors %.2f sequence-errors %.2f"
%
(
msg
3
,
max_cur_length
,
data
.
safe_exp
(
acc_loss
),
t_size
=
float
(
sum
([
len
(
x
)
for
x
in
train_set
]))
/
float
(
1000000
)
msg
=
(
"step %d step-time %.2f train-size %.3f lr %.6f grad-norm %.4f"
%
(
global_step
+
1
,
step_time
,
t_size
,
learning_rate
,
acc_grad_norm
/
FLAGS
.
steps_per_checkpoint
))
data
.
print_out
(
"%s len %d pp
l
%.
6
f errors %.2f sequence-errors %.2f"
%
(
msg
,
max_cur_length
,
data
.
safe_exp
(
acc_loss
),
100
*
acc_errors
,
100
*
acc_seq_err
))
# If errors are below the curriculum threshold, move curriculum forward.
if
curriculum
>
acc_seq_err
:
is_good
=
FLAGS
.
curriculum_ppx
>
data
.
safe_exp
(
acc_loss
)
is_good
=
is_good
and
FLAGS
.
curriculum_seq
>
acc_seq_err
if
is_good
and
is_chief
:
if
FLAGS
.
quantize
:
# Quantize weights.
data
.
print_out
(
" Quantizing parameters."
)
sess
.
run
([
quant_op
])
# Increase current length (until the next with training data).
do_incr
=
True
while
do_incr
and
max_cur_length
<
max_length
:
sess
.
run
(
model
.
cur_length_incr_op
)
for
t
in
tasks
:
if
data
.
train_set
[
t
]:
do_incr
=
False
sess
.
run
(
model
.
cur_length_incr_op
)
# Forget last perplexities if we're not yet at the end.
if
max_cur_length
<
max_length
:
prev_acc_perp
.
append
(
1000000
)
# Either increase pull or, if it's large, average parameters.
if
pull
<
0.1
:
sess
.
run
(
model
.
pull_incr_op
)
else
:
data
.
print_out
(
" Averaging parameters."
)
sess
.
run
(
model
.
avg_op
)
if
acc_seq_err
<
(
curriculum
/
3.0
):
sess
.
run
(
model
.
lr_decay_op
)
# Lower learning rate if we're worse than the last
3
checkpoints.
# Lower learning rate if we're worse than the last
5
checkpoints.
acc_perp
=
data
.
safe_exp
(
acc_loss
)
if
acc_perp
>
max
(
prev_acc_perp
[
-
3
:]):
if
acc_perp
>
max
(
prev_acc_perp
[
-
5
:])
and
is_chief
:
sess
.
run
(
model
.
lr_decay_op
)
prev_acc_perp
.
append
(
acc_perp
)
# Save checkpoint.
checkpoint_path
=
os
.
path
.
join
(
checkpoint_dir
,
"neural_gpu.ckpt"
)
model
.
saver
.
save
(
sess
,
checkpoint_path
,
global_step
=
model
.
global_step
)
# Run evaluation.
bound
=
data
.
bins
[
-
1
]
+
1
for
t
in
tasks
:
l
=
min_length
while
l
<
max_length
+
EXTRA_EVAL
and
l
<
bound
:
_
,
seq_err
,
_
=
single_test
(
l
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
)
l
+=
1
while
l
<
bound
+
1
and
not
data
.
test_set
[
t
][
l
]:
l
+=
1
if
seq_err
<
0.05
:
# Run larger test if we're good enough.
_
,
seq_err
=
multi_test
(
data
.
forward_max
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
*
4
)
if
seq_err
<
0.01
:
# Super-large test on 1-task large-forward models.
if
data
.
forward_max
>
4000
and
len
(
tasks
)
==
1
:
multi_test
(
data
.
forward_max
,
model
,
sess
,
tasks
[
0
],
FLAGS
.
nprint
,
batch_size
*
16
,
0
)
def
animate
(
l
,
test_data
,
anim_size
):
"""Create animation for the given data (hacky matplotlib use)."""
xf
=
12
# Extra frames to slow down at start and end.
fps
=
2
# Frames per step.
# Make the figure.
fig
=
plt
.
figure
(
figsize
=
(
16
,
9
),
facecolor
=
"white"
)
ax
=
fig
.
add_axes
([
0
,
0
,
1
,
1
],
frameon
=
False
,
zorder
=
2
)
ax
.
set_xticks
([
i
*
24
-
0.5
for
i
in
xrange
(
4
)])
ax
.
set_xticklabels
([])
ax
.
set_yticks
([
i
-
0.5
for
i
in
xrange
(
l
+
1
)])
ax
.
grid
(
which
=
"major"
,
axis
=
"both"
,
linestyle
=
"-"
,
color
=
"black"
)
# We need text fields.
text_fields
=
[]
text_size
=
24
*
32
/
l
for
y
in
xrange
(
l
):
text_fields
.
append
(
ax
.
text
(
11.25
,
y
+
0.15
,
""
,
color
=
"g"
,
ha
=
"center"
,
va
=
"center"
,
bbox
=
{
"facecolor"
:
"b"
,
"alpha"
:
0.01
,
"pad"
:
24
*
text_size
},
size
=
text_size
-
(
4
*
32
/
l
),
animated
=
True
))
im
=
ax
.
imshow
(
np
.
zeros_like
(
test_data
[
0
][
0
][
0
]),
vmin
=-
1.0
,
vmax
=
1.0
,
cmap
=
"gray"
,
aspect
=
"auto"
,
origin
=
"upper"
,
interpolation
=
"none"
,
animated
=
True
)
im
.
set_zorder
(
1
)
# Main animation step.
def
animation_update
(
frame_no
,
test_data
,
xf
,
im
,
text_fields
):
"""Update an animation frame."""
steps
,
inpt
,
out_raw
=
test_data
length
=
len
(
steps
)
batch
=
frame_no
/
(
fps
*
(
l
+
4
*
xf
))
index
=
int
((
frame_no
%
(
fps
*
(
l
+
4
*
xf
)))
/
fps
)
# Cut output after first padding.
out
=
[
out_raw
[
i
][
batch
]
for
i
in
xrange
(
len
(
text_fields
))]
if
0
in
out
:
i
=
out
.
index
(
0
)
out
=
out
[
0
:
i
]
+
[
0
for
_
in
xrange
(
len
(
out
)
-
i
)]
# Show the state after the first frames.
if
index
>=
2
*
xf
:
im
.
set_array
(
steps
[
min
(
length
-
1
,
index
-
2
*
xf
)][
batch
])
for
i
,
t
in
enumerate
(
text_fields
):
if
index
-
2
*
xf
<
length
:
t
.
set_text
(
""
)
else
:
t
.
set_text
(
data
.
to_symbol
(
out
[
i
]))
else
:
for
i
,
t
in
enumerate
(
text_fields
):
t
.
set_text
(
data
.
to_symbol
(
inpt
[
i
][
batch
])
if
index
<
xf
else
""
)
if
index
<
xf
:
im
.
set_array
(
np
.
zeros_like
(
steps
[
0
][
0
]))
else
:
im
.
set_array
(
steps
[
0
][
batch
])
return
im
,
# Create the animation and save to mp4.
animation
=
anim
.
FuncAnimation
(
fig
,
animation_update
,
blit
=
True
,
frames
=
(
l
+
4
*
xf
)
*
anim_size
*
fps
,
interval
=
500
/
fps
,
fargs
=
(
test_data
,
xf
,
im
,
text_fields
))
animation
.
save
(
"/tmp/neural_gpu.mp4"
,
writer
=
"mencoder"
,
fps
=
4
*
fps
,
dpi
=
3
*
80
)
if
is_chief
:
checkpoint_path
=
os
.
path
.
join
(
checkpoint_dir
,
"neural_gpu.ckpt"
)
model
.
saver
.
save
(
sess
,
checkpoint_path
,
global_step
=
model
.
global_step
)
# Run evaluation.
bin_bound
=
4
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
total_loss
,
total_err
,
tl_counter
=
0.0
,
0.0
,
0
for
bin_id
in
xrange
(
len
(
data
.
bins
)):
if
bin_id
<
bin_bound
or
bin_id
%
FLAGS
.
eval_bin_print
==
1
:
err
,
_
,
loss
=
single_test
(
bin_id
,
model
,
sess
,
FLAGS
.
nprint
,
batch_size
*
4
,
dev_set
,
p
,
beam_model
=
beam_model
)
if
loss
>
0.0
:
total_loss
+=
loss
total_err
+=
err
tl_counter
+=
1
test_loss
=
total_loss
/
max
(
1
,
tl_counter
)
test_err
=
total_err
/
max
(
1
,
tl_counter
)
test_perp
=
data
.
safe_exp
(
test_loss
)
summary
=
tf
.
Summary
()
summary
.
value
.
extend
(
[
tf
.
Summary
.
Value
(
tag
=
"test/%s/loss"
%
p
,
simple_value
=
test_loss
),
tf
.
Summary
.
Value
(
tag
=
"test/%s/error"
%
p
,
simple_value
=
test_err
),
tf
.
Summary
.
Value
(
tag
=
"test/%s/perplexity"
%
p
,
simple_value
=
test_perp
)])
sv
.
SummaryComputed
(
sess
,
summary
,
global_step
)
def linearize(output, rev_fr_vocab, simple_tokenizer=None, eos_id=wmt.EOS_ID):
  # If there is an EOS symbol in outputs, cut them at that point (WMT).
  if eos_id in output:
    output = output[:output.index(eos_id)]
  # Print out French sentence corresponding to outputs.
  if simple_tokenizer or FLAGS.simple_tokenizer:
    vlen = len(rev_fr_vocab)
    def vget(o):
      if o < vlen:
        return rev_fr_vocab[o]
      return "UNK"
    return " ".join([vget(o) for o in output])
  else:
    return wmt.basic_detokenizer([rev_fr_vocab[o] for o in output])
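# Example use of linearize with a toy reversed vocabulary (illustrative; real
# vocabularies come from wmt.initialize_vocabulary):
#   rev_vocab = ["_PAD", "_GO", "_EOS", "le", "chat"]
#   linearize([3, 4, 2, 0, 0], rev_vocab, simple_tokenizer=True, eos_id=2)
#   # -> "le chat"  (ids after the EOS id are dropped, tokens joined by spaces)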
def
evaluate
():
"""Evaluate an existing model."""
batch_size
=
FLAGS
.
batch_size
tasks
=
FLAGS
.
task
.
split
(
"-"
)
with
tf
.
Session
()
as
sess
:
model
,
min_length
,
max_length
,
_
,
_
,
ensemble
=
initialize
(
sess
)
bound
=
data
.
bins
[
-
1
]
+
1
for
t
in
tasks
:
l
=
min_length
while
l
<
max_length
+
EXTRA_EVAL
and
l
<
bound
:
_
,
seq_err
,
_
=
single_test
(
l
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
,
ensemble
=
ensemble
)
l
+=
1
while
l
<
bound
+
1
and
not
data
.
test_set
[
t
][
l
]:
l
+=
1
# Animate.
if
FLAGS
.
animate
:
anim_size
=
2
_
,
_
,
test_data
=
single_test
(
l
,
model
,
sess
,
t
,
0
,
anim_size
,
get_steps
=
True
)
animate
(
l
,
test_data
,
anim_size
)
# More tests.
_
,
seq_err
=
multi_test
(
data
.
forward_max
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
*
4
,
ensemble
=
ensemble
)
if
seq_err
<
0.01
:
# Super-test if we're very good and in large-test mode.
if
data
.
forward_max
>
4000
and
len
(
tasks
)
==
1
:
multi_test
(
data
.
forward_max
,
model
,
sess
,
tasks
[
0
],
FLAGS
.
nprint
,
batch_size
*
64
,
0
,
ensemble
=
ensemble
)
batch_size
=
FLAGS
.
batch_size
*
FLAGS
.
num_gpus
with
tf
.
Session
(
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
))
as
sess
:
(
model
,
beam_model
,
_
,
_
,
_
,
(
_
,
dev_set
,
en_vocab_path
,
fr_vocab_path
),
_
,
sess
)
=
initialize
(
sess
)
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
for
bin_id
in
xrange
(
len
(
data
.
bins
)):
if
(
FLAGS
.
task
>=
0
and
bin_id
>
4
)
or
(
FLAGS
.
nprint
==
0
and
bin_id
>
8
and
p
==
"wmt"
):
break
single_test
(
bin_id
,
model
,
sess
,
FLAGS
.
nprint
,
batch_size
,
dev_set
,
p
,
beam_model
=
beam_model
)
path
=
FLAGS
.
test_file_prefix
xid
=
""
if
FLAGS
.
task
<
0
else
(
"%.4d"
%
(
FLAGS
.
task
+
FLAGS
.
decode_offset
))
en_path
,
fr_path
=
path
+
".en"
+
xid
,
path
+
".fr"
+
xid
# Evaluate the test file if they exist.
if
path
and
tf
.
gfile
.
Exists
(
en_path
)
and
tf
.
gfile
.
Exists
(
fr_path
):
data
.
print_out
(
"Translating test set %s"
%
en_path
)
# Read lines.
en_lines
,
fr_lines
=
[],
[]
with
tf
.
gfile
.
GFile
(
en_path
,
mode
=
"r"
)
as
f
:
for
line
in
f
:
en_lines
.
append
(
line
.
strip
())
with
tf
.
gfile
.
GFile
(
fr_path
,
mode
=
"r"
)
as
f
:
for
line
in
f
:
fr_lines
.
append
(
line
.
strip
())
# Tokenize and convert to ids.
en_vocab
,
_
=
wmt
.
initialize_vocabulary
(
en_vocab_path
)
_
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_vocab_path
)
if
FLAGS
.
simple_tokenizer
:
en_ids
=
[
wmt
.
sentence_to_token_ids
(
l
,
en_vocab
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
for
l
in
en_lines
]
else
:
en_ids
=
[
wmt
.
sentence_to_token_ids
(
l
,
en_vocab
)
for
l
in
en_lines
]
# Translate.
results
=
[]
for
idx
,
token_ids
in
enumerate
(
en_ids
):
if
idx
%
5
==
0
:
data
.
print_out
(
"Translating example %d of %d."
%
(
idx
,
len
(
en_ids
)))
# Which bucket does it belong to?
buckets
=
[
b
for
b
in
xrange
(
len
(
data
.
bins
))
if
data
.
bins
[
b
]
>=
len
(
token_ids
)]
if
buckets
:
result
,
result_cost
=
[],
100000000.0
for
bucket_id
in
buckets
:
if
data
.
bins
[
bucket_id
]
>
MAXLEN_F
*
len
(
token_ids
)
+
EVAL_LEN_INCR
:
break
# Get a 1-element batch to feed the sentence to the model.
used_batch_size
=
1
# batch_size
inp
,
target
=
data
.
get_batch
(
bucket_id
,
used_batch_size
,
None
,
FLAGS
.
height
,
preset
=
([
token_ids
],
[[]]))
loss
,
output_logits
,
_
,
_
=
model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
)
outputs
=
[
int
(
o
[
0
])
for
o
in
output_logits
]
loss
=
loss
[
0
]
-
(
data
.
bins
[
bucket_id
]
*
FLAGS
.
length_norm
)
if
FLAGS
.
simple_tokenizer
:
cur_out
=
outputs
if
wmt
.
EOS_ID
in
cur_out
:
cur_out
=
cur_out
[:
cur_out
.
index
(
wmt
.
EOS_ID
)]
res_tags
=
[
rev_fr_vocab
[
o
]
for
o
in
cur_out
]
bad_words
,
bad_brack
=
wmt
.
parse_constraints
(
token_ids
,
res_tags
)
loss
+=
1000.0
*
bad_words
+
100.0
*
bad_brack
# print (bucket_id, loss)
if
loss
<
result_cost
:
result
=
outputs
result_cost
=
loss
final
=
linearize
(
result
,
rev_fr_vocab
)
results
.
append
(
"%s
\t
%s
\n
"
%
(
final
,
fr_lines
[
idx
]))
# print result_cost
sys
.
stderr
.
write
(
results
[
-
1
])
sys
.
stderr
.
flush
()
else
:
sys
.
stderr
.
write
(
"TOOO_LONG
\t
%s
\n
"
%
fr_lines
[
idx
])
sys
.
stderr
.
flush
()
if
xid
:
decode_suffix
=
"beam%dln%dn"
%
(
FLAGS
.
beam_size
,
int
(
100
*
FLAGS
.
length_norm
))
with
tf
.
gfile
.
GFile
(
path
+
".res"
+
decode_suffix
+
xid
,
mode
=
"w"
)
as
f
:
for
line
in
results
:
f
.
write
(
line
)
def mul(l):
  res = 1.0
  for s in l:
    res *= s
  return res
def
interactive
():
"""Interactively probe an existing model."""
with
tf
.
Session
()
as
sess
:
model
,
_
,
_
,
_
,
_
,
_
=
initialize
(
sess
)
sys
.
stdout
.
write
(
"Input to Neural GPU, e.g., 0 1. Use -1 for PAD.
\n
"
)
with
tf
.
Session
(
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
))
as
sess
:
# Initialize model.
(
model
,
_
,
_
,
_
,
_
,
(
_
,
_
,
en_path
,
fr_path
),
_
,
_
)
=
initialize
(
sess
)
# Load vocabularies.
en_vocab
,
rev_en_vocab
=
wmt
.
initialize_vocabulary
(
en_path
)
_
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_path
)
# Print out vectors and variables.
if
FLAGS
.
nprint
>
0
and
FLAGS
.
word_vector_file_en
:
print_vectors
(
"embedding:0"
,
en_path
,
FLAGS
.
word_vector_file_en
)
if
FLAGS
.
nprint
>
0
and
FLAGS
.
word_vector_file_fr
:
print_vectors
(
"target_embedding:0"
,
fr_path
,
FLAGS
.
word_vector_file_fr
)
total
=
0
for
v
in
tf
.
trainable_variables
():
shape
=
v
.
get_shape
().
as_list
()
total
+=
mul
(
shape
)
print
(
v
.
name
,
shape
,
mul
(
shape
))
print
total
# Start interactive loop.
sys
.
stdout
.
write
(
"Input to Neural GPU Translation Model.
\n
"
)
sys
.
stdout
.
write
(
"> "
)
sys
.
stdout
.
flush
()
inpt
=
sys
.
stdin
.
readline
()
inpt
=
sys
.
stdin
.
readline
()
,
""
while
inpt
:
ids
=
[
data
.
to_id
(
s
)
for
s
in
inpt
.
strip
().
split
()]
inpt
,
target
=
data
.
get_batch
(
len
(
ids
),
1
,
False
,
""
,
preset
=
(
ids
,
[
0
for
_
in
ids
]))
_
,
res
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
res
=
[
np
.
argmax
(
o
,
axis
=
1
)
for
o
in
res
]
res
=
[
o
for
o
in
res
[:
len
(
ids
)]
if
o
>
0
]
print
" "
+
" "
.
join
([
data
.
to_symbol
(
output
[
0
])
for
output
in
res
])
cures
=
[]
# Get token-ids for the input sentence.
if
FLAGS
.
simple_tokenizer
:
token_ids
=
wmt
.
sentence_to_token_ids
(
inpt
,
en_vocab
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
else
:
token_ids
=
wmt
.
sentence_to_token_ids
(
inpt
,
en_vocab
)
print
[
rev_en_vocab
[
t
]
for
t
in
token_ids
]
# Which bucket does it belong to?
buckets
=
[
b
for
b
in
xrange
(
len
(
data
.
bins
))
if
data
.
bins
[
b
]
>=
max
(
len
(
token_ids
),
len
(
cures
))]
if
cures
:
buckets
=
[
buckets
[
0
]]
if
buckets
:
result
,
result_cost
=
[],
10000000.0
for
bucket_id
in
buckets
:
if
data
.
bins
[
bucket_id
]
>
MAXLEN_F
*
len
(
token_ids
)
+
EVAL_LEN_INCR
:
break
glen
=
1
for
gen_idx
in
xrange
(
glen
):
# Get a 1-element batch to feed the sentence to the model.
inp
,
target
=
data
.
get_batch
(
bucket_id
,
1
,
None
,
FLAGS
.
height
,
preset
=
([
token_ids
],
[
cures
]))
loss
,
output_logits
,
_
,
_
=
model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
,
update_mem
=
False
)
# If it is a greedy decoder, outputs are argmaxes of output_logits.
if
FLAGS
.
beam_size
>
1
:
outputs
=
[
int
(
o
)
for
o
in
output_logits
]
else
:
loss
=
loss
[
0
]
-
(
data
.
bins
[
bucket_id
]
*
FLAGS
.
length_norm
)
outputs
=
[
int
(
np
.
argmax
(
logit
,
axis
=
1
))
for
logit
in
output_logits
]
print
[
rev_fr_vocab
[
t
]
for
t
in
outputs
]
print
loss
,
data
.
bins
[
bucket_id
]
print
linearize
(
outputs
,
rev_fr_vocab
)
cures
.
append
(
outputs
[
gen_idx
])
print
cures
print
linearize
(
cures
,
rev_fr_vocab
)
if
FLAGS
.
simple_tokenizer
:
cur_out
=
outputs
if
wmt
.
EOS_ID
in
cur_out
:
cur_out
=
cur_out
[:
cur_out
.
index
(
wmt
.
EOS_ID
)]
res_tags
=
[
rev_fr_vocab
[
o
]
for
o
in
cur_out
]
bad_words
,
bad_brack
=
wmt
.
parse_constraints
(
token_ids
,
res_tags
)
loss
+=
1000.0
*
bad_words
+
100.0
*
bad_brack
if
loss
<
result_cost
:
result
=
outputs
result_cost
=
loss
print
(
"FINAL"
,
result_cost
)
print
[
rev_fr_vocab
[
t
]
for
t
in
result
]
print
linearize
(
result
,
rev_fr_vocab
)
else
:
print
"TOOO_LONG"
sys
.
stdout
.
write
(
"> "
)
sys
.
stdout
.
flush
()
inpt
=
sys
.
stdin
.
readline
()
inpt
=
sys
.
stdin
.
readline
()
,
""
def
main
(
_
):
...
...
neural_gpu/program_utils.py
0 → 100644
View file @
a315e568
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for generating program synthesis and evaluation data."""
import contextlib
import sys
import StringIO
import random
import os


class ListType(object):

  def __init__(self, arg):
    self.arg = arg

  def __str__(self):
    return "[" + str(self.arg) + "]"

  def __eq__(self, other):
    if not isinstance(other, ListType):
      return False
    return self.arg == other.arg

  def __hash__(self):
    return hash(self.arg)


class VarType(object):

  def __init__(self, arg):
    self.arg = arg

  def __str__(self):
    return str(self.arg)

  def __eq__(self, other):
    if not isinstance(other, VarType):
      return False
    return self.arg == other.arg

  def __hash__(self):
    return hash(self.arg)


class FunctionType(object):

  def __init__(self, args):
    self.args = args

  def __str__(self):
    return str(self.args[0]) + " -> " + str(self.args[1])

  def __eq__(self, other):
    if not isinstance(other, FunctionType):
      return False
    return self.args == other.args

  def __hash__(self):
    return hash(tuple(self.args))


class Function(object):

  def __init__(self, name, arg_types, output_type, fn_arg_types=None):
    self.name = name
    self.arg_types = arg_types
    self.fn_arg_types = fn_arg_types or []
    self.output_type = output_type


Null = 100


## Functions
f_head = Function("c_head", [ListType("Int")], "Int")
def c_head(xs): return xs[0] if len(xs) > 0 else Null

f_last = Function("c_last", [ListType("Int")], "Int")
def c_last(xs): return xs[-1] if len(xs) > 0 else Null

f_take = Function("c_take", ["Int", ListType("Int")], ListType("Int"))
def c_take(n, xs): return xs[:n]

f_drop = Function("c_drop", ["Int", ListType("Int")], ListType("Int"))
def c_drop(n, xs): return xs[n:]

f_access = Function("c_access", ["Int", ListType("Int")], "Int")
def c_access(n, xs): return xs[n] if n >= 0 and len(xs) > n else Null

f_max = Function("c_max", [ListType("Int")], "Int")
def c_max(xs): return max(xs) if len(xs) > 0 else Null

f_min = Function("c_min", [ListType("Int")], "Int")
def c_min(xs): return min(xs) if len(xs) > 0 else Null

f_reverse = Function("c_reverse", [ListType("Int")], ListType("Int"))
def c_reverse(xs): return list(reversed(xs))

f_sort = Function("sorted", [ListType("Int")], ListType("Int"))
# def c_sort(xs): return sorted(xs)

f_sum = Function("sum", [ListType("Int")], "Int")
# def c_sum(xs): return sum(xs)


## Lambdas
# Int -> Int
def plus_one(x): return x + 1
def minus_one(x): return x - 1
def times_two(x): return x * 2
def neg(x): return x * (-1)
def div_two(x): return int(x / 2)
def sq(x): return x ** 2
def times_three(x): return x * 3
def div_three(x): return int(x / 3)
def times_four(x): return x * 4
def div_four(x): return int(x / 4)

# Int -> Bool
def pos(x): return x > 0
def neg(x): return x < 0
def even(x): return x % 2 == 0
def odd(x): return x % 2 == 1

# Int -> Int -> Int
def add(x, y): return x + y
def sub(x, y): return x - y
def mul(x, y): return x * y


# HOFs
f_map = Function("map", [ListType("Int")], ListType("Int"),
                 [FunctionType(["Int", "Int"])])
f_filter = Function("filter", [ListType("Int")], ListType("Int"),
                    [FunctionType(["Int", "Bool"])])

f_count = Function("c_count", [ListType("Int")], "Int",
                   [FunctionType(["Int", "Bool"])])
def c_count(f, xs): return len([x for x in xs if f(x)])

f_zipwith = Function("c_zipwith", [ListType("Int"), ListType("Int")],
                     ListType("Int"),
                     [FunctionType(["Int", "Int", "Int"])])
#FIX
def c_zipwith(f, xs, ys): return [f(x, y) for (x, y) in zip(xs, ys)]

f_scan = Function("c_scan", [ListType("Int")], ListType("Int"),
                  [FunctionType(["Int", "Int", "Int"])])
def c_scan(f, xs):
  out = xs
  for i in range(1, len(xs)):
    out[i] = f(xs[i], xs[i - 1])
  return out
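
The helpers above are plain Python 2 functions, so they can be exercised directly. A minimal sketch, assuming this file is importable as `program_utils` (note that `neg` is defined twice above, so the later `Int -> Bool` version `x < 0` is the binding that remains in effect):

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

xs = [3, 1, 4, 1, 5]
print(pu.c_head(xs))                 # 3
print(pu.c_count(pu.even, xs))       # 1 (only 4 is even)
print(pu.c_zipwith(pu.add, xs, xs))  # [6, 2, 8, 2, 10]
print(pu.c_scan(pu.add, list(xs)))   # [3, 4, 8, 9, 14] -- prefix sums, since
                                     # c_scan writes into the list it reads
```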
@contextlib.contextmanager
def stdoutIO(stdout=None):
  old = sys.stdout
  if stdout is None:
    stdout = StringIO.StringIO()
  sys.stdout = stdout
  yield stdout
  sys.stdout = old


def evaluate(program_str, input_names_to_vals, default="ERROR"):
  exec_str = []
  for name, val in input_names_to_vals.iteritems():
    exec_str += name + " = " + str(val) + "; "
  exec_str += program_str
  if type(exec_str) is list:
    exec_str = "".join(exec_str)
  with stdoutIO() as s:
    # pylint: disable=bare-except
    try:
      exec exec_str + " print(out)"
      return s.getvalue()[:-1]
    except:
      return default
    # pylint: enable=bare-except
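
`evaluate` runs a program string against named inputs, captures whatever the appended `print(out)` writes, and returns the `default` value if anything raises. A small sketch under the same assumptions (Python 2, module importable as `program_utils`):

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

print(pu.evaluate("out = sum(a);", {"a": [1, 2, 3]}))  # "6"
print(pu.evaluate("out = c_head(a);", {"a": []}))      # "100" (the Null marker)
print(pu.evaluate("out = a[10];", {"a": [1, 2]}))      # "ERROR" (the default)
```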
class Statement(object):
  """Statement class."""

  def __init__(self, fn, output_var, arg_vars, fn_args=None):
    self.fn = fn
    self.output_var = output_var
    self.arg_vars = arg_vars
    self.fn_args = fn_args or []

  def __str__(self):
    return "%s = %s(%s%s%s)" % (self.output_var, self.fn.name,
                                ", ".join(self.fn_args),
                                ", " if self.fn_args else "",
                                ", ".join(self.arg_vars))

  def substitute(self, env):
    self.output_var = env.get(self.output_var, self.output_var)
    self.arg_vars = [env.get(v, v) for v in self.arg_vars]


class ProgramGrower(object):
  """Grow programs."""

  def __init__(self, functions, types_to_lambdas):
    self.functions = functions
    self.types_to_lambdas = types_to_lambdas

  def grow_body(self, new_var_name, dependencies, types_to_vars):
    """Grow the program body."""
    choices = []
    for f in self.functions:
      if all([a in types_to_vars.keys() for a in f.arg_types]):
        choices.append(f)
    f = random.choice(choices)
    args = []
    for t in f.arg_types:
      possible_vars = random.choice(types_to_vars[t])
      var = random.choice(possible_vars)
      args.append(var)
      dependencies.setdefault(new_var_name, []).extend(
          [var] + (dependencies[var]))
    fn_args = [random.choice(self.types_to_lambdas[t])
               for t in f.fn_arg_types]
    types_to_vars.setdefault(f.output_type, []).append(new_var_name)
    return Statement(f, new_var_name, args, fn_args)

  def grow(self, program_len, input_types):
    """Grow the program."""
    var_names = list(reversed(map(chr, range(97, 123))))
    dependencies = dict()
    types_to_vars = dict()
    input_names = []
    for t in input_types:
      var = var_names.pop()
      dependencies[var] = []
      types_to_vars.setdefault(t, []).append(var)
      input_names.append(var)
    statements = []
    for _ in range(program_len - 1):
      var = var_names.pop()
      statements.append(self.grow_body(var, dependencies, types_to_vars))
    statements.append(self.grow_body("out", dependencies, types_to_vars))
    new_var_names = [c for c in map(chr, range(97, 123))
                     if c not in input_names]
    new_var_names.reverse()
    keep_statements = []
    env = dict()
    for s in statements:
      if s.output_var in dependencies["out"]:
        keep_statements.append(s)
        env[s.output_var] = new_var_names.pop()
      if s.output_var == "out":
        keep_statements.append(s)
    for k in keep_statements:
      k.substitute(env)
    return Program(input_names, input_types,
                   ";".join([str(k) for k in keep_statements]))


class Program(object):
  """The program class."""

  def __init__(self, input_names, input_types, body):
    self.input_names = input_names
    self.input_types = input_types
    self.body = body

  def evaluate(self, inputs):
    """Evaluate this program."""
    if len(inputs) != len(self.input_names):
      raise AssertionError("inputs and input_names have to "
                           "have the same len. inp: %s , names: %s"
                           % (str(inputs), str(self.input_names)))
    inp_str = ""
    for (name, inp) in zip(self.input_names, inputs):
      inp_str += name + " = " + str(inp) + "; "
    with stdoutIO() as s:
      # pylint: disable=exec-used
      exec inp_str + self.body + "; print(out)"
      # pylint: enable=exec-used
      return s.getvalue()[:-1]

  def flat_str(self):
    out = ""
    for s in self.body.split(";"):
      out += s + ";"
    return out

  def __str__(self):
    out = ""
    for (n, t) in zip(self.input_names, self.input_types):
      out += n + " = " + str(t) + "\n"
    for s in self.body.split(";"):
      out += s + "\n"
    return out
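
`ProgramGrower.grow` samples a straight-line program over the typed function set, keeps only the statements the final `out` variable depends on, and wraps the result in a `Program`. A hedged sketch; the seed, function subset, and input list below are arbitrary choices made for illustration:

```
# Illustrative only -- not part of program_utils.py.
import random
import program_utils as pu

random.seed(0)
grower = pu.ProgramGrower(
    functions=[pu.f_head, pu.f_last, pu.f_reverse, pu.f_sort, pu.f_sum],
    types_to_lambdas={})  # none of these functions take lambda arguments
prog = grower.grow(3, [pu.ListType("Int")])
print(prog)                        # input declaration plus the kept statements
print(prog.evaluate([[3, 1, 2]]))  # output of the exec'd body, as a string
```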
prog_vocab = []
prog_rev_vocab = {}


def tokenize(string, tokens=None):
  """Tokenize the program string."""
  if tokens is None:
    tokens = prog_vocab
  tokens = sorted(tokens, key=len, reverse=True)
  out = []
  string = string.strip()
  while string:
    found = False
    for t in tokens:
      if string.startswith(t):
        out.append(t)
        string = string[len(t):]
        found = True
        break
    if not found:
      raise ValueError("Couldn't tokenize this: " + string)
    string = string.strip()
  return out


def clean_up(output, max_val=100):
  o = eval(str(output))
  if isinstance(o, bool):
    return o
  if isinstance(o, int):
    if o >= 0:
      return min(o, max_val)
    else:
      return max(o, -1 * max_val)
  if isinstance(o, list):
    return [clean_up(l) for l in o]


def make_vocab():
  gen(2, 0)


def gen(max_len, how_many):
  """Generate some programs."""
  functions = [f_head, f_last, f_take, f_drop, f_access, f_max, f_min,
               f_reverse, f_sort, f_sum, f_map, f_filter, f_count,
               f_zipwith, f_scan]
  types_to_lambdas = {
      FunctionType(["Int", "Int"]): ["plus_one", "minus_one", "times_two",
                                     "div_two", "sq", "times_three",
                                     "div_three", "times_four", "div_four"],
      FunctionType(["Int", "Bool"]): ["pos", "neg", "even", "odd"],
      FunctionType(["Int", "Int", "Int"]): ["add", "sub", "mul"]}
  tokens = []
  for f in functions:
    tokens.append(f.name)
  for v in types_to_lambdas.values():
    tokens.extend(v)
  tokens.extend(["=", ";", ",", "(", ")", "[", "]", "Int", "out"])
  tokens.extend(map(chr, range(97, 123)))
  io_tokens = map(str, range(-220, 220))
  if not prog_vocab:
    prog_vocab.extend(["_PAD", "_EOS"] + tokens + io_tokens)
    for i, t in enumerate(prog_vocab):
      prog_rev_vocab[t] = i
  io_tokens += [",", "[", "]", ")", "(", "None"]

  grower = ProgramGrower(functions=functions,
                         types_to_lambdas=types_to_lambdas)

  def mk_inp(l):
    return [random.choice(range(-5, 5)) for _ in range(l)]

  tar = [ListType("Int")]
  inps = [[mk_inp(3)], [mk_inp(5)], [mk_inp(7)], [mk_inp(15)]]
  save_prefix = None

  outcomes_to_programs = dict()
  tried = set()
  counter = 0
  choices = [0] if max_len == 0 else range(max_len)
  while counter < 100 * how_many and len(outcomes_to_programs) < how_many:
    counter += 1
    length = random.choice(choices)
    t = grower.grow(length, tar)
    while t in tried:
      length = random.choice(choices)
      t = grower.grow(length, tar)
    # print(t.flat_str())
    tried.add(t)
    outcomes = [clean_up(t.evaluate(i)) for i in inps]
    outcome_str = str(zip(inps, outcomes))
    if outcome_str in outcomes_to_programs:
      outcomes_to_programs[outcome_str] = min(
          [t.flat_str(), outcomes_to_programs[outcome_str]],
          key=lambda x: len(tokenize(x, tokens)))
    else:
      outcomes_to_programs[outcome_str] = t.flat_str()
    if counter % 5000 == 0:
      print "== proggen: tried: " + str(counter)
      print "== proggen: kept: " + str(len(outcomes_to_programs))
    if counter % 250000 == 0 and save_prefix is not None:
      print "saving..."
      save_counter = 0
      progfilename = os.path.join(save_prefix, "prog_" + str(counter) + ".txt")
      iofilename = os.path.join(save_prefix, "io_" + str(counter) + ".txt")
      prog_token_filename = os.path.join(
          save_prefix, "prog_tokens_" + str(counter) + ".txt")
      io_token_filename = os.path.join(
          save_prefix, "io_tokens_" + str(counter) + ".txt")
      with open(progfilename, "a+") as fp, \
           open(iofilename, "a+") as fi, \
           open(prog_token_filename, "a+") as ftp, \
           open(io_token_filename, "a+") as fti:
        for (o, p) in outcomes_to_programs.iteritems():
          save_counter += 1
          if save_counter % 500 == 0:
            print "saving %d of %d" % (save_counter, len(outcomes_to_programs))
          fp.write(p + "\n")
          fi.write(o + "\n")
          ftp.write(str(tokenize(p, tokens)) + "\n")
          fti.write(str(tokenize(o, io_tokens)) + "\n")

  return list(outcomes_to_programs.values())
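
`make_vocab` populates `prog_vocab`/`prog_rev_vocab` without generating programs, and `gen(max_len, how_many)` keeps at most one (shortest) program per observed input/output behaviour. A usage sketch with arbitrary small arguments, again assuming the file is importable as `program_utils` under Python 2:

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

pu.make_vocab()                      # fills prog_vocab / prog_rev_vocab
print(pu.tokenize("out = sum(a);"))  # ['out', '=', 'sum', '(', 'a', ')', ';']
for flat in pu.gen(3, 5):            # up to 5 behaviourally distinct programs
  print(flat)
```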
neural_gpu/wmt_utils.py
0 → 100644
View file @ a315e568
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for downloading data from WMT, tokenizing, vocabularies."""
import gzip
import os
import re
import tarfile

from six.moves import urllib

import tensorflow as tf

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_CHAR_UNK"
_SPACE = b"_SPACE"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK, _SPACE]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
SPACE_ID = 4

# Regular expressions used to tokenize.
_CHAR_MARKER = "_CHAR_"
_CHAR_MARKER_LEN = len(_CHAR_MARKER)
_SPEC_CHARS = "" + chr(226) + chr(153) + chr(128)
_PUNCTUATION = "][.,!?\"':;%$#@&*+}{|><=/^~)(_`,0123456789" + _SPEC_CHARS + "-"
_WORD_SPLIT = re.compile(b"([" + _PUNCTUATION + "])")
_OLD_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")

# URLs for WMT data.
_WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
_WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"


def maybe_download(directory, filename, url):
  """Download filename from url unless it's already in directory."""
  if not tf.gfile.Exists(directory):
    print "Creating directory %s" % directory
    os.mkdir(directory)
  filepath = os.path.join(directory, filename)
  if not tf.gfile.Exists(filepath):
    print "Downloading %s to %s" % (url, filepath)
    filepath, _ = urllib.request.urlretrieve(url, filepath)
    statinfo = os.stat(filepath)
    print "Successfully downloaded", filename, statinfo.st_size, "bytes"
  return filepath


def gunzip_file(gz_path, new_path):
  """Unzips from gz_path into new_path."""
  print "Unpacking %s to %s" % (gz_path, new_path)
  with gzip.open(gz_path, "rb") as gz_file:
    with open(new_path, "wb") as new_file:
      for line in gz_file:
        new_file.write(line)


def get_wmt_enfr_train_set(directory):
  """Download the WMT en-fr training corpus to directory unless it's there."""
  train_path = os.path.join(directory, "giga-fren.release2.fixed")
  if not (tf.gfile.Exists(train_path + ".fr") and
          tf.gfile.Exists(train_path + ".en")):
    corpus_file = maybe_download(directory, "training-giga-fren.tar",
                                 _WMT_ENFR_TRAIN_URL)
    print "Extracting tar file %s" % corpus_file
    with tarfile.open(corpus_file, "r") as corpus_tar:
      corpus_tar.extractall(directory)
    gunzip_file(train_path + ".fr.gz", train_path + ".fr")
    gunzip_file(train_path + ".en.gz", train_path + ".en")
  return train_path


def get_wmt_enfr_dev_set(directory):
  """Download the WMT en-fr dev set to directory unless it's there."""
  dev_name = "newstest2013"
  dev_path = os.path.join(directory, dev_name)
  if not (tf.gfile.Exists(dev_path + ".fr") and
          tf.gfile.Exists(dev_path + ".en")):
    dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
    print "Extracting tgz file %s" % dev_file
    with tarfile.open(dev_file, "r:gz") as dev_tar:
      fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
      en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
      fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
      en_dev_file.name = dev_name + ".en"
      dev_tar.extract(fr_dev_file, directory)
      dev_tar.extract(en_dev_file, directory)
  return dev_path


def is_char(token):
  if len(token) > _CHAR_MARKER_LEN:
    if token[:_CHAR_MARKER_LEN] == _CHAR_MARKER:
      return True
  return False


def basic_detokenizer(tokens):
  """Reverse the process of the basic tokenizer below."""
  result = []
  previous_nospace = True
  for t in tokens:
    if is_char(t):
      result.append(t[_CHAR_MARKER_LEN:])
      previous_nospace = True
    elif t == _SPACE:
      result.append(" ")
      previous_nospace = True
    elif previous_nospace:
      result.append(t)
      previous_nospace = False
    else:
      result.extend([" ", t])
      previous_nospace = False
  return "".join(result)


old_style = False


def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens."""
  words = []
  if old_style:
    for space_separated_fragment in sentence.strip().split():
      words.extend(re.split(_OLD_WORD_SPLIT, space_separated_fragment))
    return [w for w in words if w]
  for space_separated_fragment in sentence.strip().split():
    tokens = [t for t in re.split(_WORD_SPLIT, space_separated_fragment) if t]
    first_is_char = False
    for i, t in enumerate(tokens):
      if len(t) == 1 and t in _PUNCTUATION:
        tokens[i] = _CHAR_MARKER + t
        if i == 0:
          first_is_char = True
    if words and words[-1] != _SPACE and (first_is_char or is_char(words[-1])):
      tokens = [_SPACE] + tokens
    spaced_tokens = []
    for i, tok in enumerate(tokens):
      spaced_tokens.append(tokens[i])
      if i < len(tokens) - 1:
        if tok != _SPACE and not (is_char(tok) or is_char(tokens[i + 1])):
          spaced_tokens.append(_SPACE)
    words.extend(spaced_tokens)
  return words
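
`basic_tokenizer` marks punctuation with `_CHAR_` and inserts explicit `_SPACE` tokens so that `basic_detokenizer` can reconstruct the original spacing. A round-trip sketch, assuming the file is importable as `wmt_utils` (it imports TensorFlow at module load and uses Python 2 print statements):

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

toks = wmt_utils.basic_tokenizer("Hello, world!")
print(toks)  # ['Hello', '_CHAR_,', '_SPACE', 'world', '_CHAR_!']
print(wmt_utils.basic_detokenizer(toks))  # Hello, world!
```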
def space_tokenizer(sentence):
  return sentence.strip().split()


def is_pos_tag(token):
  """Check if token is a part-of-speech tag."""
  return (token in ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS",
                    "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
                    "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO",
                    "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
                    "WP", "WP$", "WRB", ".", ",", ":", ")", "-LRB-", "(",
                    "-RRB-", "HYPH", "$", "``", "''", "ADD", "AFX", "QTR",
                    "BES", "-DFL-", "GW", "HVS", "NFP"])


def parse_constraints(inpt, res):
  ntags = len(res)
  nwords = len(inpt)
  npostags = len([x for x in res if is_pos_tag(x)])
  nclose = len([x for x in res if x[0] == "/"])
  nopen = ntags - nclose - npostags
  return (abs(npostags - nwords), abs(nclose - nopen))
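
`parse_constraints` scores a candidate tag sequence against the source sentence: the first number is how far the count of part-of-speech tags is from the number of input words, the second how unbalanced the opening and `/`-prefixed closing brackets are. A small illustrative check:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

# Two POS tags (DT, NN) for two input words, two opening and two closing
# brackets, so both penalties are zero.
print(wmt_utils.parse_constraints(
    ["the", "dog"], ["S", "NP", "DT", "NN", "/NP", "/S"]))  # (0, 0)
```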
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=False):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that the
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not tf.gfile.Exists(vocabulary_path):
    print "Creating vocabulary %s from data %s" % (vocabulary_path, data_path)
    vocab, chars = {}, {}
    for c in _PUNCTUATION:
      chars[c] = 1
    # Read French file.
    with tf.gfile.GFile(data_path + ".fr", mode="rb") as f:
      counter = 0
      for line_in in f:
        line = " ".join(line_in.split())
        counter += 1
        if counter % 100000 == 0:
          print " processing fr line %d" % counter
        for c in line:
          if c in chars:
            chars[c] += 1
          else:
            chars[c] = 1
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        tokens = [t for t in tokens if not is_char(t) and t != _SPACE]
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1000000000  # We want target words first.
          else:
            vocab[word] = 1000000000
    # Read English file.
    with tf.gfile.GFile(data_path + ".en", mode="rb") as f:
      counter = 0
      for line_in in f:
        line = " ".join(line_in.split())
        counter += 1
        if counter % 100000 == 0:
          print " processing en line %d" % counter
        for c in line:
          if c in chars:
            chars[c] += 1
          else:
            chars[c] = 1
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        tokens = [t for t in tokens if not is_char(t) and t != _SPACE]
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    sorted_chars = sorted(chars, key=vocab.get, reverse=True)
    sorted_chars = [_CHAR_MARKER + c for c in sorted_chars]
    vocab_list = _START_VOCAB + sorted_chars + sorted_vocab
    if tokenizer:
      vocab_list = _START_VOCAB + sorted_vocab
    if len(vocab_list) > max_vocabulary_size:
      vocab_list = vocab_list[:max_vocabulary_size]
    with tf.gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + b"\n")


def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if tf.gfile.Exists(vocabulary_path):
    rev_vocab = []
    with tf.gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)


def sentence_to_token_ids_raw(sentence, vocabulary,
                              tokenizer=None, normalize_digits=old_style):
  """Convert a string to list of integers representing token-ids.

  For example, a sentence "I have a dog" may become tokenized into
  ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
  "a": 4, "dog": 7} this function will return [1, 2, 4, 7].

  Args:
    sentence: the sentence in bytes format to convert to token-ids.
    vocabulary: a dictionary mapping tokens to integers.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

  Returns:
    a list of integers, the token-ids for the sentence.
  """
  if tokenizer:
    words = tokenizer(sentence)
  else:
    words = basic_tokenizer(sentence)
  result = []
  for w in words:
    if normalize_digits:
      w = re.sub(_DIGIT_RE, b"0", w)
    if w in vocabulary:
      result.append(vocabulary[w])
    else:
      if tokenizer:
        result.append(UNK_ID)
      else:
        result.append(SPACE_ID)
        for c in w:
          result.append(vocabulary.get(_CHAR_MARKER + c, UNK_ID))
        result.append(SPACE_ID)
  while result and result[0] == SPACE_ID:
    result = result[1:]
  while result and result[-1] == SPACE_ID:
    result = result[:-1]
  return result


def sentence_to_token_ids(sentence, vocabulary,
                          tokenizer=None, normalize_digits=old_style):
  """Convert a string to list of integers representing token-ids, tab=0."""
  tab_parts = sentence.strip().split("\t")
  toks = [sentence_to_token_ids_raw(t, vocabulary, tokenizer, normalize_digits)
          for t in tab_parts]
  res = []
  for t in toks:
    res.extend(t)
    res.append(0)
  return res[:-1]
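
With a word-level tokenizer and a toy vocabulary this reduces to a simple lookup, matching the docstring example above. A sketch under the same import assumptions:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

vocab = {"I": 5, "have": 6, "a": 7, "dog": 8}
print(wmt_utils.sentence_to_token_ids(
    "I have a dog", vocab, tokenizer=wmt_utils.space_tokenizer))  # [5, 6, 7, 8]
```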
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=False):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not tf.gfile.Exists(target_path):
    print "Tokenizing data in %s" % data_path
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with tf.gfile.GFile(data_path, mode="rb") as data_file:
      with tf.gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print " tokenizing line %d" % counter
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


def prepare_wmt_data(data_dir, vocabulary_size,
                     tokenizer=None, normalize_digits=False):
  """Get WMT data into data_dir, create vocabularies and tokenize data.

  Args:
    data_dir: directory in which the data sets will be stored.
    vocabulary_size: size of the joint vocabulary to create and use.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

  Returns:
    A tuple of 6 elements:
      (1) path to the token-ids for English training data-set,
      (2) path to the token-ids for French training data-set,
      (3) path to the token-ids for English development data-set,
      (4) path to the token-ids for French development data-set,
      (5) path to the vocabulary file,
      (6) path to the vocabulary file (for compatibility with non-joint vocab).
  """
  # Get wmt data to the specified directory.
  train_path = get_wmt_enfr_train_set(data_dir)
  dev_path = get_wmt_enfr_dev_set(data_dir)

  # Create vocabularies of the appropriate sizes.
  vocab_path = os.path.join(data_dir, "vocab%d.txt" % vocabulary_size)
  create_vocabulary(vocab_path, train_path, vocabulary_size,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  # Create token ids for the training data.
  fr_train_ids_path = train_path + (".ids%d.fr" % vocabulary_size)
  en_train_ids_path = train_path + (".ids%d.en" % vocabulary_size)
  data_to_token_ids(train_path + ".fr", fr_train_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)
  data_to_token_ids(train_path + ".en", en_train_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  # Create token ids for the development data.
  fr_dev_ids_path = dev_path + (".ids%d.fr" % vocabulary_size)
  en_dev_ids_path = dev_path + (".ids%d.en" % vocabulary_size)
  data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)
  data_to_token_ids(dev_path + ".en", en_dev_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  return (en_train_ids_path, fr_train_ids_path,
          en_dev_ids_path, fr_dev_ids_path,
          vocab_path, vocab_path)
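
End to end, the trainer only needs the returned paths. A hedged sketch of a call; the directory and vocabulary size below are illustrative, and the first run downloads several gigabytes of WMT data from statmt.org:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

(en_train, fr_train, en_dev, fr_dev,
 vocab_path, _) = wmt_utils.prepare_wmt_data("/tmp/wmt_data", 32000)
print(vocab_path)  # e.g. /tmp/wmt_data/vocab32000.txt
```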