Commit 574c981c authored by Lukasz Kaiser

Final tweaks for lower batch size.

parent 91ff146d
@@ -151,7 +151,7 @@ class NeuralGPU(object):
         tf.constant(0, dtype=tf.int32, shape=[1]),
         tf.zeros([1, vec_size]))
-    adam = tf.train.AdamOptimizer(0.01*self.lr, epsilon=1e-4)
+    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
     # Main graph creation loop, for every bin in data_utils.
     self.steps = []
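Note on this hunk: the hard-coded 0.01 multiplier moves out of the optimizer and into the "lr" flag default (changed from 0.3 to 0.003 in the next hunk), so the effective initial Adam learning rate is unchanged: 0.01 * 0.3 = 0.003. A minimal before/after sketch, with values taken from this diff and everything else illustrative:

# Before: scaling hidden inside the model; FLAGS.lr defaulted to 0.3.
adam = tf.train.AdamOptimizer(0.01 * 0.3, epsilon=1e-4)  # effective lr 0.003
# After: FLAGS.lr defaults to 0.003 and is passed through unscaled.
adam = tf.train.AdamOptimizer(0.003, epsilon=1e-4)       # same effective lr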
@@ -31,7 +31,7 @@ from tensorflow.python.platform import gfile
 import data_utils as data
 import neural_gpu
-tf.app.flags.DEFINE_float("lr", 0.3, "Learning rate.")
+tf.app.flags.DEFINE_float("lr", 0.003, "Learning rate.")
 tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
 tf.app.flags.DEFINE_float("max_grad_norm", 0.05, "Clip gradients to this norm.")
 tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
@@ -215,7 +215,7 @@ def train():
       start_time = time.time()
       inp, target = data.get_batch(l, batch_size, True, task)
       noise_param = math.sqrt(math.pow(global_step, -0.55) *
-                              (20 * prev_seq_err)) * FLAGS.grad_noise_scale
+                              prev_seq_err) * FLAGS.grad_noise_scale
       loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
       step_time += time.time() - start_time
       acc_grad_norm += float(gnorm)
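For context, the noise_param expression is a decayed gradient-noise schedule: the noise magnitude shrinks with the training step (t^-0.55) and with the recent sequence error, and this commit drops the extra factor of 20. A self-contained sketch of the new schedule, assuming the arguments mirror global_step, prev_seq_err, and FLAGS.grad_noise_scale (the helper name is hypothetical):

import math

def noise_stddev(global_step, prev_seq_err, grad_noise_scale):
  # Matches the new line above: sqrt(t^-0.55 * err) * scale.
  return math.sqrt(math.pow(global_step, -0.55) * prev_seq_err) * grad_noise_scale

# Example: at step 1000 with 10% sequence error and scale 1.0,
# sqrt(1000**-0.55 * 0.1) ~= 0.047.
print(noise_stddev(1000, 0.1, 1.0))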
@@ -234,7 +234,7 @@ def train():
       acc_loss /= step_count
       step_time /= FLAGS.steps_per_checkpoint
       acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
-      prev_seq_err = acc_seq_err
+      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
       acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
       msg1 = "step %d step-time %.2f" % (global_step, step_time)
       msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
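The new prev_seq_err assignment subtracts a 2% floor before the value feeds the noise schedule above, so gradient noise switches off entirely once the per-checkpoint sequence error falls below 2%. A quick illustration, with error values chosen for the example:

for acc_seq_err in (0.30, 0.05, 0.015):
  # Errors above 2% are shifted down by 0.02; below 2% they map to 0.0,
  # which makes noise_param zero in the schedule above.
  prev_seq_err = max(0.0, acc_seq_err - 0.02)
  print(acc_seq_err, "->", prev_seq_err)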