Commit 574c981c authored by Lukasz Kaiser

Final tweaks for lower batch size.

parent 91ff146d
@@ -151,7 +151,7 @@ class NeuralGPU(object):
         tf.constant(0, dtype=tf.int32, shape=[1]),
         tf.zeros([1, vec_size]))
-    adam = tf.train.AdamOptimizer(0.01*self.lr, epsilon=1e-4)
+    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
     # Main graph creation loop, for every bin in data_utils.
     self.steps = []
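Note on this hunk: the hard-coded 0.01 multiplier moves out of the optimizer and into the "lr" flag default (changed from 0.3 to 0.003 in the next hunk), so the effective initial Adam learning rate is unchanged: 0.01 * 0.3 = 0.003. A minimal before/after sketch, with values taken from this diff and everything else illustrative:

# Before: scaling hidden inside the model; FLAGS.lr defaulted to 0.3.
adam = tf.train.AdamOptimizer(0.01 * 0.3, epsilon=1e-4)  # effective lr 0.003
# After: FLAGS.lr defaults to 0.003 and is passed through unscaled.
adam = tf.train.AdamOptimizer(0.003, epsilon=1e-4)       # same effective lr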
@@ -31,7 +31,7 @@ from tensorflow.python.platform import gfile
 import data_utils as data
 import neural_gpu
-tf.app.flags.DEFINE_float("lr", 0.3, "Learning rate.")
+tf.app.flags.DEFINE_float("lr", 0.003, "Learning rate.")
 tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
 tf.app.flags.DEFINE_float("max_grad_norm", 0.05, "Clip gradients to this norm.")
 tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
@@ -215,7 +215,7 @@ def train():
       start_time = time.time()
       inp, target = data.get_batch(l, batch_size, True, task)
       noise_param = math.sqrt(math.pow(global_step, -0.55) *
-                              (20 * prev_seq_err)) * FLAGS.grad_noise_scale
+                              prev_seq_err) * FLAGS.grad_noise_scale
       loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
       step_time += time.time() - start_time
       acc_grad_norm += float(gnorm)
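For context, the noise_param expression is a decayed gradient-noise schedule: the noise magnitude shrinks with the training step (t^-0.55) and with the recent sequence error, and this commit drops the extra factor of 20. A self-contained sketch of the new schedule, assuming the arguments mirror global_step, prev_seq_err, and FLAGS.grad_noise_scale (the helper name is hypothetical):

import math

def noise_stddev(global_step, prev_seq_err, grad_noise_scale):
  # Matches the new line above: sqrt(t^-0.55 * err) * scale.
  return math.sqrt(math.pow(global_step, -0.55) * prev_seq_err) * grad_noise_scale

# Example: at step 1000 with 10% sequence error and scale 1.0,
# sqrt(1000**-0.55 * 0.1) ~= 0.047.
print(noise_stddev(1000, 0.1, 1.0))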
@@ -234,7 +234,7 @@ def train():
       acc_loss /= step_count
       step_time /= FLAGS.steps_per_checkpoint
       acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
-      prev_seq_err = acc_seq_err
+      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
       acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
       msg1 = "step %d step-time %.2f" % (global_step, step_time)
       msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
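The new prev_seq_err assignment subtracts a 2% floor before the value feeds the noise schedule above, so gradient noise switches off entirely once the per-checkpoint sequence error falls below 2%. A quick illustration, with error values chosen for the example:

for acc_seq_err in (0.30, 0.05, 0.015):
  # Errors above 2% are shifted down by 0.02; below 2% they map to 0.0,
  # which makes noise_param zero in the schedule above.
  prev_seq_err = max(0.0, acc_seq_err - 0.02)
  print(acc_seq_err, "->", prev_seq_err)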