Commit a00389b5

Authored Jan 27, 2017 by Lukasz Kaiser; committed by GitHub on Jan 27, 2017.

Merge pull request #960 from lukaszkaiser/ngpu-corrections

Corrections and explanations for the updated Neural GPU model.

Parents: a298143c, a046ec81
Showing 3 changed files, with 20 additions and 6 deletions (+20 −6):

* neural_gpu/README.md (+14 −2)
* neural_gpu/neural_gpu.py (+2 −2)
* neural_gpu/neural_gpu_trainer.py (+4 −2)
neural_gpu/README.md

 # NeuralGPU
-Code for the Neural GPU model as described
-in [[http://arxiv.org/abs/1511.08228]].
+Code for the Neural GPU model described
+in [[http://arxiv.org/abs/1511.08228]].
+The extended version was described
+in [[https://arxiv.org/abs/1610.08613]].
 
 Requirements:
 * TensorFlow (see tensorflow.org for how to install)
@@ -68,4 +68,16 @@ To interact with a model (experimental, see code) run:
 python neural_gpu_trainer.py --problem=bmul --mode=2
 ```
+
+To train on WMT data, set a larger --nmaps and --vocab_size and avoid curriculum:
+
+```
+python neural_gpu_trainer.py --problem=wmt --vocab_size=32768 --nmaps=256
+  --vec_size=256 --curriculum_seq=1.0 --max_length=60 --data_dir ~/wmt
+```
+
+With less memory, try lower batch size, e.g. `--batch_size=4`. With more GPUs
+in your system, there will be a batch on every GPU so you can run larger models.
+For example, `--batch_size=4 --num_gpus=4 --nmaps=512 --vec_size=512` will
+run a large model (512-size) on 4 GPUs, with effective batches of 4*4=16.
 
 Maintained by Lukasz Kaiser (lukaszkaiser)
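The multi-GPU arithmetic in the added README text is plain data parallelism: each GPU gets its own batch of `--batch_size` examples and the per-GPU gradients are combined before each update, so one step covers `batch_size * num_gpus` examples. A minimal TF 1.x-style sketch of that pattern (hypothetical `model_fn` and helper name, not the trainer's actual wiring):

```python
import tensorflow as tf

def tower_gradients(model_fn, batches, opt):
  """Average gradients from one model replica ("tower") per GPU."""
  tower_grads = []
  for i, batch in enumerate(batches):  # one batch per GPU
    with tf.device("/gpu:%d" % i):
      loss = model_fn(batch)  # each tower computes its own loss
      tower_grads.append(opt.compute_gradients(loss))
  # For every variable, average its gradient across all towers.
  averaged = []
  for grads_and_vars in zip(*tower_grads):
    grads = [g for g, _ in grads_and_vars]
    var = grads_and_vars[0][1]
    averaged.append((tf.add_n(grads) / float(len(grads)), var))
  return averaged  # effective batch = batch_size * num_gpus
```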
neural_gpu/neural_gpu.py

@@ -128,7 +128,7 @@ def conv_gru(inpts, mem, kw, kh, nmaps, rate, cutoff, prefix, do_layer_norm,
   reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
   gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
   if cutoff > 10:
-    candidate = tf.tanh_hard(conv_lin(inpts + [reset * mem], "c", 0.0))
+    candidate = tanh_hard(conv_lin(inpts + [reset * mem], "c", 0.0))
   else:
     # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
     candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
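This hunk fixes a real bug rather than style: TensorFlow has no `tf.tanh_hard`, so the `cutoff > 10` branch would have raised an `AttributeError`; `tanh_hard` is a helper defined in the module itself. For intuition, saturating gate nonlinearities of this kind are typically clipped linear maps. A hedged sketch of how they are commonly written (not the module's exact definitions):

```python
import tensorflow as tf

def tanh_hard(x):
  """Hard tanh: identity on [-1, 1], saturated to +/-1 outside (sketch)."""
  return tf.minimum(1.0, tf.maximum(-1.0, x))

def sigmoid_cutoff(x, cutoff):
  """Sigmoid stretched by `cutoff` and clipped back to [0, 1] (sketch).

  With cutoff > 1 the gate can hit exactly 0 or 1, letting GRU-style
  gates saturate fully; cutoff <= 1 reduces to a plain sigmoid here.
  """
  y = tf.sigmoid(x)
  if cutoff < 1.01:
    return y
  d = (cutoff - 1.0) / 2.0
  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d))
```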
@@ -273,7 +273,7 @@ class NeuralGPU(object):
     if backward:
       adam_lr = 0.005 * self.lr
-      adam = tf.train.AdamOptimizer(adam_lr, epsilon=2e-4)
+      adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3)
 
       def adam_update(grads):
         return adam.apply_gradients(zip(grads, tf.trainable_variables()),
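The only change in this hunk raises Adam's `epsilon` from 2e-4 to 1e-3. `epsilon` sits in the denominator of Adam's parameter update, so a larger value caps the step taken for parameters whose second-moment estimate is near zero, trading a little adaptivity for stability. A plain-Python sketch of the standard Adam step it enters (textbook form, not repo code):

```python
def adam_step(theta, g, m, v, t, lr, beta1=0.9, beta2=0.999, epsilon=1e-3):
  """One Adam update for a single scalar parameter (textbook sketch)."""
  m = beta1 * m + (1 - beta1) * g       # first-moment (mean) estimate
  v = beta2 * v + (1 - beta2) * g * g   # second-moment estimate
  m_hat = m / (1 - beta1 ** t)          # bias corrections for step t
  v_hat = v / (1 - beta2 ** t)
  theta -= lr * m_hat / (v_hat ** 0.5 + epsilon)  # epsilon bounds the step
  return theta, m, v
```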
neural_gpu/neural_gpu_trainer.py

@@ -35,7 +35,7 @@ tf.app.flags.DEFINE_float("max_grad_norm", 4.0, "Clip gradients to this norm.")
 tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
 tf.app.flags.DEFINE_float("curriculum_ppx", 9.9, "Move curriculum if ppl < X.")
 tf.app.flags.DEFINE_float("curriculum_seq", 0.3, "Move curriculum if seq < X.")
-tf.app.flags.DEFINE_float("dropout", 0.0, "Dropout that much.")
+tf.app.flags.DEFINE_float("dropout", 0.1, "Dropout that much.")
 tf.app.flags.DEFINE_float("grad_noise_scale", 0.0, "Gradient noise scale.")
 tf.app.flags.DEFINE_float("max_sampling_rate", 0.1, "Maximal sampling rate.")
 tf.app.flags.DEFINE_float("length_norm", 0.0, "Length normalization.")
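For readers unfamiliar with the TF 1.x flags module used throughout this file: `DEFINE_float` registers a command-line flag with a default value and help string, and the parsed value is read back through the global `FLAGS` object. A minimal standalone sketch:

```python
import tensorflow as tf

tf.app.flags.DEFINE_float("dropout", 0.1, "Dropout that much.")
FLAGS = tf.app.flags.FLAGS

def main(_):
  # Overridable on the command line, e.g. `python script.py --dropout=0.2`.
  print("dropout =", FLAGS.dropout)

if __name__ == "__main__":
  tf.app.run()  # parses flags, then calls main
```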
@@ -263,7 +263,8 @@ def initialize(sess=None):
     data.rev_vocab = rev_fr_vocab
   data.print_out("Reading development and training data (limit: %d)."
                  % FLAGS.max_train_data_size)
-  dev_set = read_data(en_dev, fr_dev, data.bins)
+  dev_set = {}
+  dev_set["wmt"] = read_data(en_dev, fr_dev, data.bins)
   def data_read(size, print_out):
     read_data_into_global(en_train, fr_train, data.bins, size, print_out)
   data_read(50000, False)
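This hunk stores the WMT dev data in a dict under the problem name rather than in a bare variable, presumably so evaluation code can look dev sets up per problem, in line with the `--problem` flag. A toy illustration of that access pattern (placeholder data; assumed usage, not the trainer's code):

```python
# Dev data keyed by problem name, selectable via the problem flag.
dev_set = {}
dev_set["wmt"] = [("source sentence", "target sentence")]  # placeholder

problem = "wmt"  # would come from FLAGS.problem
for source, target in dev_set[problem]:
  print(source, "->", target)
```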
@@ -330,6 +331,7 @@ def initialize(sess=None):
   ngpu.CHOOSE_K = FLAGS.soft_mem_size
   do_beam_model = FLAGS.train_beam_freq > 0.0001 and FLAGS.beam_size > 1
   beam_size = FLAGS.beam_size if FLAGS.mode > 0 and not do_beam_model else 1
+  beam_size = min(beam_size, FLAGS.beam_size)
   beam_model = None
   def make_ngpu(cur_beam_size, back):
     return ngpu.NeuralGPU(
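The added `min` only changes anything when `FLAGS.beam_size` is below the value the conditional picked, i.e. it appears to guard against the `else 1` branch producing a beam wider than the flag requests (for instance `--beam_size=0`). A two-line illustration of the clamp:

```python
flag_beam_size = 0                          # e.g. beam search disabled
beam_size = 1                               # what the `else 1` branch yields
beam_size = min(beam_size, flag_beam_size)  # clamped back to 0
```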