ModelZoo / ResNet50_tensorflow / Commits / a315e568

Commit a315e568, authored Jan 25, 2017 by Lukasz Kaiser
Parent: d66941ac

    Update to the Neural GPU.

Showing 6 changed files with 2615 additions and 607 deletions (+2615 / -607).
Changed files:

  neural_gpu/README.md              +21   -12
  neural_gpu/data_utils.py          +193  -52
  neural_gpu/neural_gpu.py          +637  -217
  neural_gpu/neural_gpu_trainer.py  +889  -326
  neural_gpu/program_utils.py       +440  -0
  neural_gpu/wmt_utils.py           +435  -0
neural_gpu/README.md

@@ -4,7 +4,6 @@ in [[http://arxiv.org/abs/1511.08228]].
 Requirements:
 * TensorFlow (see tensorflow.org for how to install)
-* Matplotlib for Python (sudo apt-get install python-matplotlib)

 The model can be trained on the following algorithmic tasks:

@@ -26,17 +25,27 @@ The model can be trained on the following algorithmic tasks:
 * `qadd` - Long quaternary addition
 * `search` - Search for symbol key in dictionary

-The value range for symbols are defined by the `niclass` and `noclass` flags.
-In particular, the values are in the range `min(--niclass, noclass) - 1`.
-So if you set `--niclass=33` and `--noclass=33` (the default) then `--task=rev`
-will be reversing lists of 32 symbols, and `--task=id` will be identity on a
-list of up to 32 symbols.
+It can also be trained on the WMT English-French translation task:
+
+* `wmt` - WMT English-French translation (data will be downloaded)
+
+The value range for symbols are defined by the `vocab_size` flag.
+In particular, the values are in the range `vocab_size - 1`.
+So if you set `--vocab_size=16` (the default) then `--problem=rev`
+will be reversing lists of 15 symbols, and `--problem=id` will be identity
+on a list of up to 15 symbols.

-To train the model on the reverse task run:
+To train the model on the binary multiplication task run:

 ```
-python neural_gpu_trainer.py --task=rev
+python neural_gpu_trainer.py --problem=bmul
+```
+
+This trains the Extended Neural GPU, to train the original model run:
+
+```
+python neural_gpu_trainer.py --problem=bmul --beam_size=0
 ```

 While training, interim / checkpoint model parameters will be

@@ -47,16 +56,16 @@ with, hit `Ctrl-C` to stop the training process. The latest
 model parameters will be in `/tmp/neural_gpu/neural_gpu.ckpt-<step>`
 and used on any subsequent run.

-To test a trained model on how well it decodes run:
+To evaluate a trained model on how well it decodes run:

 ```
-python neural_gpu_trainer.py --task=rev --mode=1
+python neural_gpu_trainer.py --problem=bmul --mode=1
 ```

-To produce an animation of the result run:
+To interact with a model (experimental, see code) run:
 ```
-python neural_gpu_trainer.py --task=rev --mode=1 --animate=True
+python neural_gpu_trainer.py --problem=bmul --mode=2
 ```

 Maintained by Lukasz Kaiser (lukaszkaiser)
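For orientation, here is a toy illustration (not part of the commit) of what a `rev` example looks like under the new `--vocab_size=16` default, assuming symbols range over 1..15 with 0 reserved for padding (as the flag descriptions in the trainer suggest):

```python
# Illustrative sketch only, not code from the repository.
# With --vocab_size=16, symbols take values 1..15 and 0 is padding,
# so --problem=rev reverses lists drawn from 15 distinct symbols.
vocab_size = 16
inp = [3, 14, 7, 1, 9]          # symbols in range(1, vocab_size)
target = list(reversed(inp))    # what the model is trained to produce
assert all(0 < s < vocab_size for s in inp)
print(inp, "->", target)        # [3, 14, 7, 1, 9] -> [9, 1, 7, 14, 3]
```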
neural_gpu/data_utils.py

@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Convolutional Gated Recurrent Networks for Algorithm Learning."""
+"""Neural GPU -- data generation and batching utilities."""

 import math
 import os
 import random
 import sys
 import time

@@ -22,22 +23,28 @@ import time
 import numpy as np
 import tensorflow as tf

-from tensorflow.python.platform import gfile
+import program_utils

 FLAGS = tf.app.flags.FLAGS

-bins = [8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 64, 128]
+bins = [2 + bin_idx_i for bin_idx_i in xrange(256)]
 all_tasks = ["sort", "kvsort", "id", "rev", "rev2", "incr", "add", "left",
              "right", "left-shift", "right-shift", "bmul", "mul", "dup",
-             "badd", "qadd", "search"]
-forward_max = 128
+             "badd", "qadd", "search", "progeval", "progsynth"]
 log_filename = ""
+vocab, rev_vocab = None, None


 def pad(l):
   for b in bins:
     if b >= l: return b
-  return forward_max
+  return bins[-1]


+def bin_for(l):
+  for i, b in enumerate(bins):
+    if b >= l: return i
+  return len(bins) - 1


 train_set = {}
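For intuition (an illustrative plain-Python sketch, not code from the commit): with the new `bins = [2 + i for i in xrange(256)]`, every length from 2 to 257 gets its own bucket, so `pad` returns the smallest bin that fits a sequence and `bin_for` returns that bin's index.

```python
bins = [2 + bin_idx_i for bin_idx_i in range(256)]  # 2, 3, ..., 257

def pad(l):
    # Smallest bin length that fits a sequence of length l.
    for b in bins:
        if b >= l:
            return b
    return bins[-1]

def bin_for(l):
    # Index of that bin.
    for i, b in enumerate(bins):
        if b >= l:
            return i
    return len(bins) - 1

print(pad(5), bin_for(5))      # 5 3   -- length 5 fits bin index 3 exactly
print(pad(300), bin_for(300))  # 257 255 -- lengths past the last bin are clamped
```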
@@ -50,6 +57,35 @@ for some_task in all_tasks:
     test_set[some_task].append([])


+def read_tmp_file(name):
+  """Read from a file with the given name in our log directory or above."""
+  dirname = os.path.dirname(log_filename)
+  fname = os.path.join(dirname, name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    fname = os.path.join(dirname, "../" + name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    fname = os.path.join(dirname, "../../" + name + ".txt")
+  if not tf.gfile.Exists(fname):
+    print_out("== not found file: " + fname)
+    return None
+  print_out("== found file: " + fname)
+  res = []
+  with tf.gfile.GFile(fname, mode="r") as f:
+    for line in f:
+      res.append(line.strip())
+  return res
+
+
+def write_tmp_file(name, lines):
+  dirname = os.path.dirname(log_filename)
+  fname = os.path.join(dirname, name + ".txt")
+  with tf.gfile.GFile(fname, mode="w") as f:
+    for line in lines:
+      f.write(line + "\n")
+
+
 def add(n1, n2, base=10):
   """Add two numbers represented as lower-endian digit lists."""
   k = max(len(n1), len(n2)) + 1
@@ -130,6 +166,30 @@ def init_data(task, length, nbr_cases, nclass):
     sorted_kv = [(k, vals[i]) for (k, i) in sorted(keys)]
     return [x for p in kv for x in p], [x for p in sorted_kv for x in p]

+  def prog_io_pair(prog, max_len, counter=0):
+    try:
+      ilen = np.random.randint(max_len - 3) + 1
+      bound = max(15 - (counter / 20), 1)
+      inp = [random.choice(range(-bound, bound)) for _ in range(ilen)]
+      inp_toks = [program_utils.prog_rev_vocab[t]
+                  for t in program_utils.tokenize(str(inp)) if t != ","]
+      out = program_utils.evaluate(prog, {"a": inp})
+      out_toks = [program_utils.prog_rev_vocab[t]
+                  for t in program_utils.tokenize(str(out)) if t != ","]
+      if counter > 400:
+        out_toks = []
+      if (out_toks and out_toks[0] == program_utils.prog_rev_vocab["["] and
+          len(out_toks) != len([o for o in out if o == ","]) + 3):
+        raise ValueError("generated list with too long ints")
+      if (out_toks and out_toks[0] != program_utils.prog_rev_vocab["["] and
+          len(out_toks) > 1):
+        raise ValueError("generated one int but tokenized it to many")
+      if len(out_toks) > max_len:
+        raise ValueError("output too long")
+      return (inp_toks, out_toks)
+    except ValueError:
+      return prog_io_pair(prog, max_len, counter+1)
+
   def spec(inp):
     """Return the target given the input for some tasks."""
     if task == "sort":
@@ -164,43 +224,114 @@ def init_data(task, length, nbr_cases, nclass):

   l = length
   cur_time = time.time()
   total_time = 0.0
-  for case in xrange(nbr_cases):
+
+  is_prog = task in ["progeval", "progsynth"]
+  if is_prog:
+    inputs_per_prog = 5
+    program_utils.make_vocab()
+    progs = read_tmp_file("programs_len%d" % (l / 10))
+    if not progs:
+      progs = program_utils.gen(l / 10, 1.2 * nbr_cases / inputs_per_prog)
+      write_tmp_file("programs_len%d" % (l / 10), progs)
+    prog_ios = read_tmp_file("programs_len%d_io" % (l / 10))
+    nbr_cases = min(nbr_cases, len(progs) * inputs_per_prog) / 1.2
+    if not prog_ios:
+      # Generate program io data.
+      prog_ios = []
+      for pidx, prog in enumerate(progs):
+        if pidx % 500 == 0:
+          print_out("== generating io pairs for program %d" % pidx)
+        if pidx * inputs_per_prog > nbr_cases * 1.2:
+          break
+        ptoks = [program_utils.prog_rev_vocab[t]
+                 for t in program_utils.tokenize(prog)]
+        ptoks.append(program_utils.prog_rev_vocab["_EOS"])
+        plen = len(ptoks)
+        for _ in xrange(inputs_per_prog):
+          if task == "progeval":
+            inp, out = prog_io_pair(prog, plen)
+            prog_ios.append(str(inp) + "\t" + str(out) + "\t" + prog)
+          elif task == "progsynth":
+            plen = max(len(ptoks), 8)
+            for _ in xrange(3):
+              inp, out = prog_io_pair(prog, plen / 2)
+              prog_ios.append(str(inp) + "\t" + str(out) + "\t" + prog)
+      write_tmp_file("programs_len%d_io" % (l / 10), prog_ios)
+    prog_ios_dict = {}
+    for s in prog_ios:
+      i, o, p = s.split("\t")
+      i_clean = "".join([c for c in i if c.isdigit() or c == " "])
+      o_clean = "".join([c for c in o if c.isdigit() or c == " "])
+      inp = [int(x) for x in i_clean.split()]
+      out = [int(x) for x in o_clean.split()]
+      if inp and out:
+        if p in prog_ios_dict:
+          prog_ios_dict[p].append([inp, out])
+        else:
+          prog_ios_dict[p] = [[inp, out]]
+    # Use prog_ios_dict to create data.
+    progs = []
+    for prog in prog_ios_dict:
+      if len([c for c in prog if c == ";"]) <= (l / 10):
+        progs.append(prog)
+    nbr_cases = min(nbr_cases, len(progs) * inputs_per_prog) / 1.2
+    print_out("== %d training cases on %d progs" % (nbr_cases, len(progs)))
+    for pidx, prog in enumerate(progs):
+      if pidx * inputs_per_prog > nbr_cases * 1.2:
+        break
+      ptoks = [program_utils.prog_rev_vocab[t]
+               for t in program_utils.tokenize(prog)]
+      ptoks.append(program_utils.prog_rev_vocab["_EOS"])
+      plen = len(ptoks)
+      dset = train_set if pidx < nbr_cases / inputs_per_prog else test_set
+      for _ in xrange(inputs_per_prog):
+        if task == "progeval":
+          inp, out = prog_ios_dict[prog].pop()
+          dset[task][bin_for(plen)].append([[ptoks, inp, [], []], [out]])
+        elif task == "progsynth":
+          plen, ilist = max(len(ptoks), 8), [[]]
+          for _ in xrange(3):
+            inp, out = prog_ios_dict[prog].pop()
+            ilist.append(inp + out)
+          dset[task][bin_for(plen)].append([ilist, [ptoks]])
+
+  for case in xrange(0 if is_prog else nbr_cases):
     total_time += time.time() - cur_time
     cur_time = time.time()
     if l > 10000 and case % 100 == 1:
       print_out("  avg gen time %.4f s" % (total_time / float(case)))
     if task in ["add", "badd", "qadd", "bmul", "mul"]:
       i, t = rand_pair(l, task)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[[], i, [], []], [t]])
       i, t = rand_pair(l, task)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[[], i, [], []], [t]])
     elif task == "dup":
       i, t = rand_dup_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_dup_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "rev2":
       i, t = rand_rev2_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_rev2_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "search":
       i, t = rand_search_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_search_pair(l)
-      test_set[task][len(i)].append([i, t])
+      test_set[task][bin_for(len(i))].append([[i], [t]])
     elif task == "kvsort":
       i, t = rand_kvsort_pair(l)
-      train_set[task][len(i)].append([i, t])
+      train_set[task][bin_for(len(i))].append([[i], [t]])
       i, t = rand_kvsort_pair(l)
-      test_set[task][len(i)].append([i, t])
-    else:
+      test_set[task][bin_for(len(i))].append([[i], [t]])
+    elif task not in ["progeval", "progsynth"]:
       inp = [np.random.randint(nclass - 1) + 1 for i in xrange(l)]
       target = spec(inp)
-      train_set[task][l].append([inp, target])
+      train_set[task][bin_for(l)].append([[inp], [target]])
       inp = [np.random.randint(nclass - 1) + 1 for i in xrange(l)]
       target = spec(inp)
-      test_set[task][l].append([inp, target])
+      test_set[task][bin_for(l)].append([[inp], [target]])
 def to_symbol(i):

@@ -218,37 +349,31 @@ def to_id(s):
   return int(s) + 1


-def get_batch(max_length, batch_size, do_train, task, offset=None, preset=None):
+def get_batch(bin_id, batch_size, data_set, height, offset=None, preset=None):
   """Get a batch of data, training or testing."""
-  inputs = []
-  targets = []
-  length = max_length
-  if preset is None:
-    cur_set = test_set[task]
-    if do_train: cur_set = train_set[task]
-    while not cur_set[length]:
-      length -= 1
-  pad_length = pad(length)
+  inputs, targets = [], []
+  pad_length = bins[bin_id]
   for b in xrange(batch_size):
     if preset is None:
-      elem = random.choice(cur_set[length])
-      if offset is not None and offset + b < len(cur_set[length]):
-        elem = cur_set[length][offset + b]
+      elem = random.choice(data_set[bin_id])
+      if offset is not None and offset + b < len(data_set[bin_id]):
+        elem = data_set[bin_id][offset + b]
     else:
       elem = preset
-    inp, target = elem[0], elem[1]
-    assert len(inp) == length
-    inputs.append(inp + [0 for l in xrange(pad_length - len(inp))])
-    targets.append(target + [0 for l in xrange(pad_length - len(target))])
-  res_input = []
-  res_target = []
-  for l in xrange(pad_length):
-    new_input = np.array([inputs[b][l] for b in xrange(batch_size)],
-                         dtype=np.int32)
-    new_target = np.array([targets[b][l] for b in xrange(batch_size)],
-                          dtype=np.int32)
-    res_input.append(new_input)
-    res_target.append(new_target)
+    inpt, targett, inpl, targetl = elem[0], elem[1], [], []
+    for inp in inpt:
+      inpl.append(inp + [0 for _ in xrange(pad_length - len(inp))])
+    if len(inpl) == 1:
+      for _ in xrange(height - 1):
+        inpl.append([0 for _ in xrange(pad_length)])
+    for target in targett:
+      targetl.append(target + [0 for _ in xrange(pad_length - len(target))])
+    inputs.append(inpl)
+    targets.append(targetl)
+  res_input = np.array(inputs, dtype=np.int32)
+  res_target = np.array(targets, dtype=np.int32)
+  assert list(res_input.shape) == [batch_size, height, pad_length]
+  assert list(res_target.shape) == [batch_size, 1, pad_length]
   return res_input, res_target
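To make the new storage and batching layout concrete, here is an illustrative sketch (not code from the commit; values are made up) of how one single-row example is stored, padded to the bin length, and extended with zero rows up to `height`, matching the asserts in the reconstructed `get_batch` above:

```python
# Illustrative only -- not code from the repository.
# Old element layout:  train_set[task][len(i)]          == [i, t]
# New element layout:  train_set[task][bin_for(len(i))] == [input_rows, [t]]
import numpy as np

pad_length, height = 8, 4
inp, tgt = [3, 5, 7], [7, 5, 3]

inpl = [inp + [0] * (pad_length - len(inp))]           # pad the one input row
inpl += [[0] * pad_length for _ in range(height - 1)]  # zero rows up to height
tgtl = [tgt + [0] * (pad_length - len(tgt))]

res_input = np.array([inpl], dtype=np.int32)   # batch of one example
res_target = np.array([tgtl], dtype=np.int32)
print(res_input.shape, res_target.shape)       # (1, 4, 8) (1, 1, 8)
```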
@@ -256,11 +381,11 @@ def print_out(s, newline=True):
   """Print a message out and log it to file."""
   if log_filename:
     try:
-      with gfile.GFile(log_filename, mode="a") as f:
+      with tf.gfile.GFile(log_filename, mode="a") as f:
        f.write(s + ("\n" if newline else ""))
     # pylint: disable=bare-except
     except:
-      sys.stdout.write("Error appending to %s\n" % log_filename)
+      sys.stderr.write("Error appending to %s\n" % log_filename)
   sys.stdout.write(s + ("\n" if newline else ""))
   sys.stdout.flush()
@@ -269,21 +394,36 @@ def decode(output):
   return [np.argmax(o, axis=1) for o in output]


-def accuracy(inpt, output, target, batch_size, nprint):
+def accuracy(inpt_t, output, target_t, batch_size, nprint,
+             beam_out=None, beam_scores=None):
   """Calculate output accuracy given target."""
   assert nprint < batch_size + 1
+  inpt = []
+  for h in xrange(inpt_t.shape[1]):
+    inpt.extend([inpt_t[:, h, l] for l in xrange(inpt_t.shape[2])])
+  target = [target_t[:, 0, l] for l in xrange(target_t.shape[2])]
+  def tok(i):
+    if rev_vocab and i < len(rev_vocab):
+      return rev_vocab[i]
+    return str(i - 1)
   def task_print(inp, output, target):
     stop_bound = 0
     print_len = 0
     while print_len < len(target) and target[print_len] > stop_bound:
       print_len += 1
-    print_out("    i: " + " ".join([str(i - 1) for i in inp if i > 0]))
+    print_out("    i: " + " ".join([tok(i) for i in inp if i > 0]))
     print_out("    o: " +
-              " ".join([str(output[l] - 1) for l in xrange(print_len)]))
+              " ".join([tok(output[l]) for l in xrange(print_len)]))
     print_out("    t: " +
-              " ".join([str(target[l] - 1) for l in xrange(print_len)]))
+              " ".join([tok(target[l]) for l in xrange(print_len)]))
   decoded_target = target
   decoded_output = decode(output)
+  # Use beam output if given and score is high enough.
+  if beam_out is not None:
+    for b in xrange(batch_size):
+      if beam_scores[b] >= 10.0:
+        for l in xrange(min(len(decoded_output), beam_out.shape[2])):
+          decoded_output[l][b] = int(beam_out[b, 0, l])
   total = 0
   errors = 0
   seq = [0 for b in xrange(batch_size)]
@@ -311,6 +451,7 @@ def accuracy(inpt, output, target, batch_size, nprint):

 def safe_exp(x):
   perp = 10000
+  x = float(x)
   if x < 100: perp = math.exp(x)
   if perp > 10000: return 10000
   return perp
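As a quick illustration (not from the commit), `safe_exp` reports a perplexity that is capped at 10000, so very large loss values never hit an overflow in `math.exp`:

```python
import math

def safe_exp(x):
  perp = 10000
  x = float(x)
  if x < 100: perp = math.exp(x)
  if perp > 10000: return 10000
  return perp

print(safe_exp(2.0))    # ~7.389
print(safe_exp(250.0))  # 10000 -- exp() is skipped entirely once x >= 100
```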
neural_gpu/neural_gpu.py

@@ -16,26 +16,34 @@
 import time

 import numpy as np
 import tensorflow as tf

-import data_utils
+from tensorflow.python.framework import function
+import data_utils as data
+
+do_jit = False  # Gives more speed but experimental for now.
+jit_scope = tf.contrib.compiler.jit.experimental_jit_scope


-def conv_linear(args, kw, kh, nin, nout, do_bias, bias_start, prefix):
+def conv_linear(args, kw, kh, nin, nout, rate, do_bias, bias_start, prefix):
   """Convolutional linear map."""
   assert args is not None
   if not isinstance(args, (list, tuple)):
     args = [args]
   with tf.variable_scope(prefix):
-    k = tf.get_variable("CvK", [kw, kh, nin, nout])
+    with tf.device("/cpu:0"):
+      k = tf.get_variable("CvK", [kw, kh, nin, nout])
     if len(args) == 1:
-      res = tf.nn.conv2d(args[0], k, [1, 1, 1, 1], "SAME")
+      arg = args[0]
     else:
-      res = tf.nn.conv2d(tf.concat(3, args), k, [1, 1, 1, 1], "SAME")
+      arg = tf.concat(args, 3)
+    res = tf.nn.convolution(arg, k, dilation_rate=(rate, 1), padding="SAME")
     if not do_bias: return res
-    bias_term = tf.get_variable("CvB", [nout],
-                                initializer=tf.constant_initializer(0.0))
-    return res + bias_term + bias_start
+    with tf.device("/cpu:0"):
+      bias_term = tf.get_variable(
+          "CvB", [nout], initializer=tf.constant_initializer(bias_start))
+    bias_term = tf.reshape(bias_term, [1, 1, 1, nout])
+    return res + bias_term
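As an aside (not part of the commit): `dilation_rate=(rate, 1)` makes the kernel skip `rate - 1` positions along the length axis, which is what the new `atrous` option later in this file exploits by doubling the rate per layer. A minimal NumPy-free sketch, with hypothetical names, of which input positions a dilated kernel touches:

```python
def dilated_positions(center, kernel_width, rate):
    # Input positions a kernel of width kernel_width touches at the given
    # dilation rate (stride between taps = rate).
    half = kernel_width // 2
    return [center + rate * k for k in range(-half, half + 1)]

print(dilated_positions(10, 3, 1))  # [9, 10, 11]  -- ordinary convolution
print(dilated_positions(10, 3, 2))  # [8, 10, 12]  -- atrous, rate 2
print(dilated_positions(10, 3, 4))  # [6, 10, 14]  -- rate 4, wider receptive field
```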
 def sigmoid_cutoff(x, cutoff):

@@ -43,7 +51,34 @@ def sigmoid_cutoff(x, cutoff):
   y = tf.sigmoid(x)
   if cutoff < 1.01: return y
   d = (cutoff - 1.0) / 2.0
-  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d))
+  return tf.minimum(1.0, tf.maximum(0.0, cutoff * y - d), name="cutoff_min")


+@function.Defun(tf.float32, noinline=True)
+def sigmoid_cutoff_12(x):
+  """Sigmoid with cutoff 1.2, specialized for speed and memory use."""
+  y = tf.sigmoid(x)
+  return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1), name="cutoff_min_12")
+
+
+@function.Defun(tf.float32, noinline=True)
+def sigmoid_hard(x):
+  """Hard sigmoid."""
+  return tf.minimum(1.0, tf.maximum(0.0, 0.25 * x + 0.5))
+
+
+def place_at14(decided, selected, it):
+  """Place selected at it-th coordinate of decided, dim=1 of 4."""
+  slice1 = decided[:, :it, :, :]
+  slice2 = decided[:, it+1:, :, :]
+  return tf.concat([slice1, selected, slice2], 1)
+
+
+def place_at13(decided, selected, it):
+  """Place selected at it-th coordinate of decided, dim=1 of 3."""
+  slice1 = decided[:, :it, :]
+  slice2 = decided[:, it+1:, :]
+  return tf.concat([slice1, selected, slice2], 1)
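For intuition (an illustrative plain-Python sketch, not code from the commit): the `cutoff` family of gates rescales a sigmoid so it saturates at exactly 0 and 1, and `sigmoid_hard` replaces it with a piecewise-linear version once the cutoff is very large:

```python
import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

def sigmoid_cutoff(x, cutoff):
    # Scaled sigmoid clipped to [0, 1]; saturates exactly for large |x|.
    y = sigmoid(x)
    if cutoff < 1.01:
        return y
    d = (cutoff - 1.0) / 2.0
    return min(1.0, max(0.0, cutoff * y - d))

def sigmoid_hard(x):
    # Piecewise-linear approximation used when cutoff > 10.
    return min(1.0, max(0.0, 0.25 * x + 0.5))

for x in (-6.0, 0.0, 6.0):
    print(x, round(sigmoid_cutoff(x, 1.2), 3), sigmoid_hard(x))
# With cutoff=1.2 the gate reaches exactly 0.0 / 1.0 for large |x|,
# e.g. sigmoid_cutoff(6.0, 1.2) == 1.0 while sigmoid(6.0) is about 0.9975.
```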
 def tanh_cutoff(x, cutoff):

@@ -54,18 +89,80 @@ def tanh_cutoff(x, cutoff):
   return tf.minimum(1.0, tf.maximum(-1.0, (1.0 + d) * y))


-def conv_gru(inpts, mem, kw, kh, nmaps, cutoff, prefix):
+@function.Defun(tf.float32, noinline=True)
+def tanh_hard(x):
+  """Hard tanh."""
+  return tf.minimum(1.0, tf.maximum(0.0, x))
+
+
+def layer_norm(x, nmaps, prefix, epsilon=1e-5):
+  """Layer normalize the 4D tensor x, averaging over the last dimension."""
+  with tf.variable_scope(prefix):
+    scale = tf.get_variable("layer_norm_scale", [nmaps],
+                            initializer=tf.ones_initializer())
+    bias = tf.get_variable("layer_norm_bias", [nmaps],
+                           initializer=tf.zeros_initializer())
+    mean, variance = tf.nn.moments(x, [3], keep_dims=True)
+    norm_x = (x - mean) / tf.sqrt(variance + epsilon)
+    return norm_x * scale + bias
+
+
+def conv_gru(inpts, mem, kw, kh, nmaps, rate, cutoff, prefix, do_layer_norm,
+             args_len=None):
   """Convolutional GRU."""
   def conv_lin(args, suffix, bias_start):
-    return conv_linear(args, kw, kh, len(args) * nmaps, nmaps, True,
-                       bias_start, prefix + "/" + suffix)
-  reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
-  # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
-  candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
-  gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
+    total_args_len = args_len or len(args) * nmaps
+    res = conv_linear(args, kw, kh, total_args_len, nmaps, rate, True,
+                      bias_start, prefix + "/" + suffix)
+    if do_layer_norm:
+      return layer_norm(res, nmaps, prefix + "/" + suffix)
+    else:
+      return res
+  if cutoff == 1.2:
+    reset = sigmoid_cutoff_12(conv_lin(inpts + [mem], "r", 1.0))
+    gate = sigmoid_cutoff_12(conv_lin(inpts + [mem], "g", 1.0))
+  elif cutoff > 10:
+    reset = sigmoid_hard(conv_lin(inpts + [mem], "r", 1.0))
+    gate = sigmoid_hard(conv_lin(inpts + [mem], "g", 1.0))
+  else:
+    reset = sigmoid_cutoff(conv_lin(inpts + [mem], "r", 1.0), cutoff)
+    gate = sigmoid_cutoff(conv_lin(inpts + [mem], "g", 1.0), cutoff)
+  if cutoff > 10:
+    candidate = tf.tanh_hard(conv_lin(inpts + [reset * mem], "c", 0.0))
+  else:
+    # candidate = tanh_cutoff(conv_lin(inpts + [reset * mem], "c", 0.0), cutoff)
+    candidate = tf.tanh(conv_lin(inpts + [reset * mem], "c", 0.0))
   return gate * mem + (1 - gate) * candidate


+CHOOSE_K = 256
+
+
+def memory_call(q, l, nmaps, mem_size, vocab_size, num_gpus, update_mem):
+  raise ValueError("Fill for experiments with additional memory structures.")
+
+
+def memory_run(step, nmaps, mem_size, batch_size, vocab_size,
+               global_step, do_training, update_mem, decay_factor, num_gpus,
+               target_emb_weights, output_w, gpu_targets_tn, it):
+  """Run memory."""
+  q = step[:, 0, it, :]
+  mlabels = gpu_targets_tn[:, it, 0]
+  res, mask, mem_loss = memory_call(
+      q, mlabels, nmaps, mem_size, vocab_size, num_gpus, update_mem)
+  res = tf.gather(target_emb_weights, res) * tf.expand_dims(mask[:, 0], 1)
+
+  # Mix gold and original in the first steps, 20% later.
+  gold = tf.nn.dropout(tf.gather(target_emb_weights, mlabels), 0.7)
+  use_gold = 1.0 - tf.cast(global_step, tf.float32) / (1000. * decay_factor)
+  use_gold = tf.maximum(use_gold, 0.2) * do_training
+  mem = tf.cond(tf.less(tf.random_uniform([]), use_gold),
+                lambda: use_gold * gold + (1.0 - use_gold) * res,
+                lambda: res)
+  mem = tf.reshape(mem, [-1, 1, 1, nmaps])
+  return mem, mem_loss, update_mem
+
+
+@tf.RegisterGradient("CustomIdG")
+def _custom_id_grad(_, grads):
+  return grads
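As a side note (an illustrative NumPy sketch, not from the commit): `layer_norm` above normalizes each position of the 4-D state over its `nmaps` channels before applying a learned scale and bias.

```python
import numpy as np

def layer_norm_np(x, scale, bias, epsilon=1e-5):
    # x: [batch, length, height, nmaps]; normalize over the last axis.
    mean = x.mean(axis=3, keepdims=True)
    variance = x.var(axis=3, keepdims=True)
    norm_x = (x - mean) / np.sqrt(variance + epsilon)
    return norm_x * scale + bias

x = np.random.randn(2, 5, 4, 8).astype(np.float32)
y = layer_norm_np(x, scale=np.ones(8), bias=np.zeros(8))
print(abs(y.mean(axis=3)).max())  # ~0: every position is now zero-mean
```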
@@ -86,237 +183,560 @@ def quantize_weights_op(quant_scale, max_value):
   return tf.group(*ops)


 def relaxed_average(var_name_suffix, rx_step):
   """Calculate the average of relaxed variables having var_name_suffix."""
   relaxed_vars = []
   for l in xrange(rx_step):
     with tf.variable_scope("RX%d" % l, reuse=True):
       try:
         relaxed_vars.append(tf.get_variable(var_name_suffix))
       except ValueError:
         pass
   dsum = tf.add_n(relaxed_vars)
   avg = dsum / len(relaxed_vars)
   diff = [v - avg for v in relaxed_vars]
   davg = tf.add_n([d * d for d in diff])
   return avg, tf.reduce_sum(davg)


 def relaxed_distance(rx_step):
   """Distance between relaxed variables and their average."""
   res, ops, rx_done = [], [], {}
   for v in tf.trainable_variables():
     if v.name[0:2] == "RX":
       rx_name = v.op.name[v.name.find("/") + 1:]
       if rx_name not in rx_done:
         avg, dist_loss = relaxed_average(rx_name, rx_step)
         res.append(dist_loss)
         rx_done[rx_name] = avg
       ops.append(v.assign(rx_done[rx_name]))
   return tf.add_n(res), tf.group(*ops)


-def make_dense(targets, noclass):
+def autoenc_quantize(x, nbits, nmaps, do_training, layers=1):
+  """Autoencoder into nbits vectors of bits, using noise and sigmoids."""
+  enc_x = tf.reshape(x, [-1, nmaps])
+  for i in xrange(layers - 1):
+    enc_x = tf.layers.dense(enc_x, nmaps, name="autoenc_%d" % i)
+  enc_x = tf.layers.dense(enc_x, nbits, name="autoenc_%d" % (layers - 1))
+  noise = tf.truncated_normal(tf.shape(enc_x), stddev=2.0)
+  dec_x = sigmoid_cutoff_12(enc_x + noise * do_training)
+  dec_x = tf.reshape(dec_x, [-1, nbits])
+  for i in xrange(layers):
+    dec_x = tf.layers.dense(dec_x, nmaps, name="autodec_%d" % i)
+  return tf.reshape(dec_x, tf.shape(x))
+
+
+def make_dense(targets, noclass, low_param):
   """Move a batch of targets to a dense 1-hot representation."""
-  with tf.device("/cpu:0"):
-    shape = tf.shape(targets)
-    batch_size = shape[0]
-    indices = targets + noclass * tf.range(0, batch_size)
-    length = tf.expand_dims(batch_size * noclass, 0)
-    dense = tf.sparse_to_dense(indices, length, 1.0, 0.0)
-  return tf.reshape(dense, [-1, noclass])
-
-
-def check_for_zero(sparse):
-  """In a sparse batch of ints, make 1.0 if it's 0 and 0.0 else."""
-  with tf.device("/cpu:0"):
-    shape = tf.shape(sparse)
-    batch_size = shape[0]
-    sparse = tf.minimum(sparse, 1)
-    indices = sparse + 2 * tf.range(0, batch_size)
-    dense = tf.sparse_to_dense(indices, tf.expand_dims(2 * batch_size, 0),
-                               1.0, 0.0)
-    reshaped = tf.reshape(dense, [-1, 2])
-  return tf.reshape(tf.slice(reshaped, [0, 0], [-1, 1]), [-1])
+  low = low_param / float(noclass - 1)
+  high = 1.0 - low * (noclass - 1)
+  targets = tf.cast(targets, tf.int64)
+  return tf.one_hot(targets, depth=noclass, on_value=high, off_value=low)
+
+
+def reorder_beam(beam_size, batch_size, beam_val, output, is_first,
+                 tensors_to_reorder):
+  """Reorder to minimize beam costs."""
+  # beam_val is [batch_size x beam_size]; let b = batch_size * beam_size
+  # decided is len x b x a x b
+  # output is b x out_size; step is b x len x a x b;
+  outputs = tf.split(tf.nn.log_softmax(output), beam_size, 0)
+  all_beam_vals, all_beam_idx = [], []
+  beam_range = 1 if is_first else beam_size
+  for i in xrange(beam_range):
+    top_out, top_out_idx = tf.nn.top_k(outputs[i], k=beam_size)
+    cur_beam_val = beam_val[:, i]
+    top_out = tf.Print(top_out, [top_out, top_out_idx, beam_val, i,
+                                 cur_beam_val], "GREPO", summarize=8)
+    all_beam_vals.append(top_out + tf.expand_dims(cur_beam_val, 1))
+    all_beam_idx.append(top_out_idx)
+  all_beam_idx = tf.reshape(tf.transpose(tf.concat(all_beam_idx, 1), [1, 0]),
+                            [-1])
+  top_beam, top_beam_idx = tf.nn.top_k(tf.concat(all_beam_vals, 1), k=beam_size)
+  top_beam_idx = tf.Print(top_beam_idx, [top_beam, top_beam_idx],
+                          "GREP", summarize=8)
+  reordered = [[] for _ in xrange(len(tensors_to_reorder) + 1)]
+  top_out_idx = []
+  for i in xrange(beam_size):
+    which_idx = top_beam_idx[:, i] * batch_size + tf.range(batch_size)
+    top_out_idx.append(tf.gather(all_beam_idx, which_idx))
+    which_beam = top_beam_idx[:, i] / beam_size  # [batch]
+    which_beam = which_beam * batch_size + tf.range(batch_size)
+    reordered[0].append(tf.gather(output, which_beam))
+    for i, t in enumerate(tensors_to_reorder):
+      reordered[i + 1].append(tf.gather(t, which_beam))
+  new_tensors = [tf.concat(t, 0) for t in reordered]
+  top_out_idx = tf.concat(top_out_idx, 0)
+  return (top_beam, new_tensors[0], top_out_idx, new_tensors[1:])
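Stepping back to `make_dense` above: the new `low_param` argument implements label smoothing via `tf.one_hot` with `on_value`/`off_value`. An illustrative NumPy check (not from the commit) of the values it produces:

```python
import numpy as np

def make_dense_np(targets, noclass, low_param):
    # Mirrors the TF version: off_value = low_param / (noclass - 1),
    # on_value = 1 - low_param, so every row still sums to 1.
    low = low_param / float(noclass - 1)
    high = 1.0 - low * (noclass - 1)
    out = np.full((len(targets), noclass), low)
    out[np.arange(len(targets)), targets] = high
    return out

dense = make_dense_np([2, 0], noclass=4, low_param=0.1)
print(dense.round(3))     # rows like [0.033 0.033 0.9 0.033]
print(dense.sum(axis=1))  # [1. 1.]
```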
 class NeuralGPU(object):
   """Neural GPU Model."""

-  def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
-               max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
-               learning_rate, pull, pull_incr, min_length, act_noise=0.0):
+  def __init__(self, nmaps, vec_size, niclass, noclass, dropout,
+               max_grad_norm, cutoff, nconvs, kw, kh, height, mem_size,
+               learning_rate, min_length, num_gpus, num_replicas,
+               grad_noise_scale, sampling_rate, act_noise=0.0, do_rnn=False,
+               atrous=False, beam_size=1, backward=True, do_layer_norm=False,
+               autoenc_decay=1.0):
     # Feeds for parameters and ops to update them.
-    self.global_step = tf.Variable(0, trainable=False)
-    self.cur_length = tf.Variable(min_length, trainable=False)
-    self.cur_length_incr_op = self.cur_length.assign_add(1)
-    self.lr = tf.Variable(float(learning_rate), trainable=False)
-    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
-    self.pull = tf.Variable(float(pull), trainable=False)
-    self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
+    self.nmaps = nmaps
+    if backward:
+      self.global_step = tf.Variable(0, trainable=False, name="global_step")
+      self.cur_length = tf.Variable(min_length, trainable=False)
+      self.cur_length_incr_op = self.cur_length.assign_add(1)
+      self.lr = tf.Variable(learning_rate, trainable=False)
+      self.lr_decay_op = self.lr.assign(self.lr * 0.995)
     self.do_training = tf.placeholder(tf.float32, name="do_training")
+    self.update_mem = tf.placeholder(tf.int32, name="update_mem")
     self.noise_param = tf.placeholder(tf.float32, name="noise_param")

     # Feeds for inputs, targets, outputs, losses, etc.
-    self.input = []
-    self.target = []
-    for l in xrange(data_utils.forward_max + 1):
-      self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
-      self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
-    self.outputs = []
-    self.losses = []
-    self.grad_norms = []
-    self.updates = []
+    self.input = tf.placeholder(tf.int32, name="inp")
+    self.target = tf.placeholder(tf.int32, name="tgt")
+    self.prev_step = tf.placeholder(tf.float32, name="prev_step")
+    gpu_input = tf.split(self.input, num_gpus, 0)
+    gpu_target = tf.split(self.target, num_gpus, 0)
+    gpu_prev_step = tf.split(self.prev_step, num_gpus, 0)
+    batch_size = tf.shape(gpu_input[0])[0]
+
+    if backward:
+      adam_lr = 0.005 * self.lr
+      adam = tf.train.AdamOptimizer(adam_lr, epsilon=2e-4)
+
+      def adam_update(grads):
+        return adam.apply_gradients(zip(grads, tf.trainable_variables()),
+                                    global_step=self.global_step,
+                                    name="adam_update")
+
+    # When switching from Adam to SGD we perform reverse-decay.
+    if backward:
+      global_step_float = tf.cast(self.global_step, tf.float32)
+      sampling_decay_exponent = global_step_float / 100000.0
+      sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent))
+      self.sampling = sampling_rate * 0.05 / sampling_decay
+    else:
+      self.sampling = tf.constant(0.0)
+
+    # Cache variables on cpu if needed.
+    if num_replicas > 1 or num_gpus > 1:
+      with tf.device("/cpu:0"):
+        caching_const = tf.constant(0)
+      tf.get_variable_scope().set_caching_device(caching_const.op.device)
+      # partitioner = tf.variable_axis_size_partitioner(1024*256*4)
+      # tf.get_variable_scope().set_partitioner(partitioner)
+
+    def gpu_avg(l):
+      if l[0] is None:
+        for elem in l:
+          assert elem is None
+        return 0.0
+      if len(l) < 2:
+        return l[0]
+      return sum(l) / float(num_gpus)
+
+    self.length_tensor = tf.placeholder(tf.int32, name="length")

     # Computation.
-    inp0_shape = tf.shape(self.input[0])
-    batch_size = inp0_shape[0]
     with tf.device("/cpu:0"):
       emb_weights = tf.get_variable(
           "embedding", [niclass, vec_size],
           initializer=tf.random_uniform_initializer(-1.7, 1.7))
+      if beam_size > 0:
+        target_emb_weights = tf.get_variable(
+            "target_embedding", [noclass, nmaps],
+            initializer=tf.random_uniform_initializer(-1.7, 1.7))
       e0 = tf.scatter_update(emb_weights,
                              tf.constant(0, dtype=tf.int32, shape=[1]),
                              tf.zeros([1, vec_size]))
+    output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32)

-    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
-
-    # Main graph creation loop, for every bin in data_utils.
-    self.steps = []
-    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
-      data_utils.print_out("Creating model for bin of length %d." % length)
-      start_time = time.time()
-      if length > data_utils.bins[0]:
+    def conv_rate(layer):
+      if atrous:
+        return 2**layer
+      return 1
+
+    # pylint: disable=cell-var-from-loop
+    def enc_step(step):
+      """Encoder step."""
+      if autoenc_decay < 1.0:
+        quant_step = autoenc_quantize(step, 16, nmaps, self.do_training)
+        if backward:
+          exp_glob = tf.train.exponential_decay(
+              1.0, self.global_step - 10000, 1000, autoenc_decay)
+          dec_factor = 1.0 - exp_glob  # * self.do_training
+          dec_factor = tf.cond(tf.less(self.global_step, 10500),
+                               lambda: tf.constant(0.05), lambda: dec_factor)
+        else:
+          dec_factor = 1.0
+        cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
+                      lambda: quant_step, lambda: step)
+      else:
+        cur = step
+      if dropout > 0.0001:
+        cur = tf.nn.dropout(cur, keep_prob)
+      if act_noise > 0.00001:
+        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
+      # Do nconvs-many CGRU steps.
+      if do_jit and tf.get_variable_scope().reuse:
+        with jit_scope():
+          for layer in xrange(nconvs):
+            cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
+                           cutoff, "ecgru_%d" % layer, do_layer_norm)
+      else:
+        for layer in xrange(nconvs):
+          cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
+                         cutoff, "ecgru_%d" % layer, do_layer_norm)
+      return cur
+
+    zero_tgt = tf.zeros([batch_size, nmaps, 1])
+    zero_tgt.set_shape([None, nmaps, 1])
+
+    def dec_substep(step, decided):
+      """Decoder sub-step."""
+      cur = step
+      if dropout > 0.0001:
+        cur = tf.nn.dropout(cur, keep_prob)
+      if act_noise > 0.00001:
+        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
+      # Do nconvs-many CGRU steps, conditioned on what was decided so far.
+      if do_jit and tf.get_variable_scope().reuse:
+        with jit_scope():
+          for layer in xrange(nconvs):
+            cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
+                           cutoff, "dcgru_%d" % layer, do_layer_norm)
+      else:
+        for layer in xrange(nconvs):
+          cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
+                         cutoff, "dcgru_%d" % layer, do_layer_norm)
+      return cur
+    # pylint: enable=cell-var-from-loop
+
+    def dec_step(step, it, it_int, decided, output_ta, tgts,
+                 mloss, nupd_in, out_idx, beam_cost):
+      """Decoder step."""
+      nupd, mem_loss = 0, 0.0
+      if mem_size > 0:
+        it_incr = tf.minimum(it + 1, length - 1)
+        mem, mem_loss, nupd = memory_run(
+            step, nmaps, mem_size, batch_size, noclass, self.global_step,
+            self.do_training, self.update_mem, 10, num_gpus,
+            target_emb_weights, output_w, gpu_targets_tn, it_incr)
+      step = dec_substep(step, decided)
+      output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
+      # Calculate argmax output.
+      output = tf.reshape(output_l, [-1, nmaps])
+      # pylint: disable=cell-var-from-loop
+      output = tf.matmul(output, output_w)
+      if beam_size > 1:
+        beam_cost, output, out, reordered = reorder_beam(
+            beam_size, batch_size, beam_cost, output, it_int == 0,
+            [output_l, out_idx, step, decided])
+        [output_l, out_idx, step, decided] = reordered
+      else:
+        # Scheduled sampling.
+        out = tf.multinomial(tf.stop_gradient(output), 1)
+        out = tf.to_int32(tf.squeeze(out, [1]))
+      out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
+      output = tf.gather(target_emb_weights, out)
+      output = tf.reshape(output, [-1, 1, nmaps])
+      output = tf.concat([output] * height, 1)
+      tgt = tgts[it, :, :, :]
+      selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
+                         lambda: output, lambda: tgt)
+      # pylint: enable=cell-var-from-loop
+      dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
+      out_idx = place_at13(
+          out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
+      if mem_size > 0:
+        mem = tf.concat([mem] * height, 2)
+        dec_write = place_at14(dec_write, mem, it_incr)
+      return (step, dec_write, out_write, mloss + mem_loss,
+              nupd_in + nupd, out_idx, beam_cost)
+
+    # Main model construction.
+    gpu_outputs = []
+    gpu_losses = []
+    gpu_grad_norms = []
+    grads_list = []
+    gpu_out_idx = []
+    self.after_enc_step = []
+    for gpu in xrange(num_gpus):  # Multi-GPU towers, average gradients later.
+      length = self.length_tensor
+      length_float = tf.cast(length, tf.float32)
+      if gpu > 0:
+        tf.get_variable_scope().reuse_variables()
+      gpu_outputs.append([])
+      gpu_losses.append([])
+      gpu_grad_norms.append([])
+      with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
+        # Main graph creation loop.
+        data.print_out("Creating model.")
+        start_time = time.time()
+        # Embed inputs and calculate mask.
+        with tf.device("/cpu:0"):
+          tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
+          weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0,
+                             tf.ones(tgt_shape), tf.zeros(tgt_shape))
+          # Embed inputs and targets.
+          with tf.control_dependencies([e0]):
+            start = tf.gather(emb_weights, gpu_input[gpu])  # b x h x l x nmaps
+            gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
+            if beam_size > 0:
+              embedded_targets_tn = tf.gather(target_emb_weights,
+                                              gpu_targets_tn)
+              embedded_targets_tn = tf.transpose(
+                  embedded_targets_tn, [2, 0, 1, 3])  # len x b x 1 x nmaps
+              embedded_targets_tn = tf.concat(
+                  [embedded_targets_tn] * height, 2)
+
+        # First image comes from start by applying convolution and adding 0s.
+        start = tf.transpose(start, [0, 2, 1, 3])  # Now b x len x h x vec_s
+        first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input")
+        first = layer_norm(first, nmaps, "input")
+
+        # Computation steps.
+        keep_prob = dropout * 3.0 / tf.sqrt(length_float)
+        keep_prob = 1.0 - self.do_training * keep_prob
+        act_noise_scale = act_noise * self.do_training
+
+        # Start with a convolutional gate merging previous step.
+        step = conv_gru([gpu_prev_step[gpu]], first, kw, kh, nmaps, 1,
+                        cutoff, "first", do_layer_norm)
+
+        if do_rnn:
+          # This is just for running a baseline RNN seq2seq model.
+          self.after_enc_step.append(step)  # Not meaningful here, but needed.
+          # ... baseline branch: a MultiRNNCell of BasicLSTMCells run with
+          # tf.nn.dynamic_rnn over the reshaped step, an attention_query
+          # Defun over the encoder outputs, a tf.scan decoder loop
+          # (decoder_loop_fn, with optional memory_call), a dense "out_proj",
+          # and logits via output_w ...
+          gpu_out_idx.append(tf.argmax(outputs, 2))
+        else:
+          # Here we go with the Neural GPU.
+          # Encoder.
+          enc_length = length
+          step = enc_step(step)  # First step hard-coded.
+          # ... the remaining enc_length - 1 encoder steps run enc_step inside
+          # a tf.while_loop (enc_step_lambda) with parallel_iterations=1 and
+          # swap_memory=True ...
+          self.after_enc_step.append(step)
+
+          # Decoder.
+          if beam_size > 0:
+            output_ta = tf.TensorArray(
+                dtype=tf.float32, size=length, dynamic_size=False,
+                infer_shape=False, name="outputs")
+            out_idx = tf.zeros([beam_size * batch_size, length, 1],
+                               dtype=tf.int32)
+            decided_t = tf.zeros([beam_size * batch_size, length,
+                                  height, vec_size])
+            # Prepare for beam search.
+            tgts = tf.concat([embedded_targets_tn] * beam_size, 1)
+            beam_cost = tf.zeros([batch_size, beam_size])
+            step = tf.concat([step] * beam_size, 0)
+            # First step hard-coded; the remaining steps run dec_step inside
+            # a tf.while_loop (step_lambda) with reuse_variables().
+            # ...
+            gpu_out_idx.append(tf.squeeze(out_idx, [2]))
+            outputs = output_ta.stack()
+            outputs = tf.squeeze(outputs, [2, 3])  # Now l x b x nmaps
+          else:
+            # If beam_size is 0 or less, we don't have a decoder.
+            mem_loss = 0.0
+            outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
+            gpu_out_idx.append(tf.argmax(outputs, 2))
+
+        # Final convolution to get logits, list outputs.
+        outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
+        outputs = tf.reshape(outputs, [length, batch_size, noclass])
+        gpu_outputs[gpu] = tf.nn.softmax(outputs)
+
+        # Calculate cross-entropy loss and normalize it.
+        targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
+                                  noclass, 0.1)
+        targets_soft = tf.reshape(targets_soft, [-1, noclass])
+        targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
+                                  noclass, 0.0)
+        targets_hard = tf.reshape(targets_hard, [-1, noclass])
+        output = tf.transpose(outputs, [1, 0, 2])
+        xent_soft = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
+            logits=tf.reshape(output, [-1, noclass]), labels=targets_soft),
+                               [batch_size, length])
+        xent_hard = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
+            logits=tf.reshape(output, [-1, noclass]), labels=targets_hard),
+                               [batch_size, length])
+        low, high = 0.1 / float(noclass - 1), 0.9
+        const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low)
+        weight_sum = tf.reduce_sum(weights) + 1e-20
+        true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
+        soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
+        perp_loss = soft_loss + const
+        # Final loss: cross-entropy + shared parameter relaxation part + extra.
+        mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
+        total_loss = perp_loss + mem_loss
+        gpu_losses[gpu].append(true_perp)
+
+        # Gradients.
+        if backward:
+          data.print_out("Creating backward pass for the model.")
+          grads = tf.gradients(total_loss, tf.trainable_variables(),
+                               colocate_gradients_with_ops=True)
+          for g_i, g in enumerate(grads):
+            if isinstance(g, tf.IndexedSlices):
+              grads[g_i] = tf.convert_to_tensor(g)
+          grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
+          gpu_grad_norms[gpu].append(norm)
+          for g in grads:
+            if grad_noise_scale > 0.001:
+              g += tf.truncated_normal(tf.shape(g)) * self.noise_param
+          grads_list.append(grads)
+        else:
+          gpu_grad_norms[gpu].append(0.0)
+        data.print_out("Created model for gpu %d in %.2f s."
+                       % (gpu, time.time() - start_time))
+
+    self.updates = []
+    self.after_enc_step = tf.concat(self.after_enc_step, 0)  # Concat GPUs.
+    if backward:
+      tf.get_variable_scope()._reuse = False
+      tf.get_variable_scope().set_caching_device(None)
+      grads = [gpu_avg([grads_list[g][i] for g in xrange(num_gpus)])
+               for i in xrange(len(grads_list[0]))]
+      update = adam_update(grads)
+      self.updates.append(update)
+    else:
+      self.updates.append(tf.no_op())
+
+    self.losses = [gpu_avg([gpu_losses[g][i] for g in xrange(num_gpus)])
+                   for i in xrange(len(gpu_losses[0]))]
+    self.out_idx = tf.concat(gpu_out_idx, 0)
+    self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in xrange(num_gpus)])
+                       for i in xrange(len(gpu_grad_norms[0]))]
+    self.outputs = [tf.concat([gpu_outputs[g] for g in xrange(num_gpus)], 1)]
+    self.quantize_op = quantize_weights_op(512, 8)
+    if backward:
+      self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

-      # ... removed per-bin graph construction: per-position embedding
-      # lookups with masks from check_for_zero, shifted_mask / scales,
-      # RX%d variable scopes running the old 7-argument conv_gru, a final
-      # conv_linear "output" layer with per-position softmax outputs,
-      # make_dense(noclass) targets, xent normalized by batch and length,
-      # relaxed_distance(rx_step) * self.pull added to the loss, and
-      # per-bin Adam updates with gradient noise ...
-      data_utils.print_out("Created model for bin of length %d in"
-                           " %.2f s." % (length, time.time() - start_time))
-    self.saver = tf.train.Saver(tf.all_variables())

-  def step(self, sess, inp, target, do_backward, noise_param=None,
-           get_steps=False):
+  def step(self, sess, inp, target, do_backward_in, noise_param=None,
+           beam_size=2, eos_id=2, eos_cost=0.0, update_mem=None, state=None):
     """Run a step of the network."""
-    assert len(inp) == len(target)
-    length = len(target)
+    batch_size, height, length = inp.shape[0], inp.shape[1], inp.shape[2]
+    do_backward = do_backward_in
+    train_mode = True
+    if do_backward_in is None:
+      do_backward = False
+      train_mode = False
+    if update_mem is None:
+      update_mem = do_backward
     feed_in = {}
+    # print "  feeding sequences of length %d" % length
+    if state is None:
+      state = np.zeros([batch_size, length, height, self.nmaps])
+    feed_in[self.prev_step.name] = state
+    feed_in[self.length_tensor.name] = length
     feed_in[self.noise_param.name] = noise_param if noise_param else 0.0
     feed_in[self.do_training.name] = 1.0 if do_backward else 0.0
+    feed_in[self.update_mem.name] = 1 if update_mem else 0
+    if do_backward_in is False:
+      feed_in[self.sampling.name] = 0.0
+    index = 0  # We're dynamic now.
     feed_out = []
-    index = len(data_utils.bins)
-    if length < data_utils.bins[-1] + 1:
-      index = data_utils.bins.index(length)
     if do_backward:
       feed_out.append(self.updates[index])
       feed_out.append(self.grad_norms[index])
-    feed_out.append(self.losses[index])
-    for l in xrange(length):
-      feed_in[self.input[l].name] = inp[l]
-    for l in xrange(length):
-      feed_in[self.target[l].name] = target[l]
-      feed_out.append(self.outputs[index][l])
-    if get_steps:
-      for l in xrange(length+1):
-        feed_out.append(self.steps[index][l])
-    res = sess.run(feed_out, feed_in)
+    if train_mode:
+      feed_out.append(self.losses[index])
+    feed_in[self.input.name] = inp
+    feed_in[self.target.name] = target
+    feed_out.append(self.outputs[index])
+    if train_mode:
+      # Make a full-sequence training step with one call to session.run.
+      res = sess.run([self.after_enc_step] + feed_out, feed_in)
+      after_enc_state, res = res[0], res[1:]
+    else:
+      # Make a full-sequence decoding step with one call to session.run.
+      feed_in[self.sampling.name] = 1.1  # Sample every time.
+      res = sess.run([self.after_enc_step, self.out_idx] + feed_out, feed_in)
+      after_enc_state, out_idx = res[0], res[1]
+      res = [res[2][l] for l in xrange(length)]
+      outputs = [out_idx[:, i] for i in xrange(length)]
+      cost = [0.0 for _ in xrange(beam_size * batch_size)]
+      seen_eos = [0 for _ in xrange(beam_size * batch_size)]
+      for idx, logit in enumerate(res):
+        best = outputs[idx]
+        for b in xrange(batch_size):
+          if seen_eos[b] > 1:
+            cost[b] -= eos_cost
+          else:
+            cost[b] += np.log(logit[b][best[b]])
+          if best[b] in [eos_id]:
+            seen_eos[b] += 1
+      res = [[-c for c in cost]] + outputs
     # Collect and output results.
     offset = 0
     norm = None
     if do_backward:
       offset = 2
       norm = res[1]
-    outputs = res[offset + 1:offset + 1 + length]
-    steps = res[offset + 1 + length:] if get_steps else None
-    return res[offset], outputs, norm, steps
+    if train_mode:
+      outputs = res[offset + 1]
+      outputs = [outputs[l] for l in xrange(length)]
+    return res[offset], outputs, norm, after_enc_state
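One detail worth pulling out of `__init__` above: the scheduled-sampling rate grows as training progresses, because the decay term it is divided by shrinks toward its 0.05 floor. An illustrative plain-Python evaluation (not from the commit) of that schedule, using the `max_sampling_rate` default of 0.1 for `sampling_rate`:

```python
# sampling = sampling_rate * 0.05 / max(0.05, 0.5 ** (global_step / 100000.0))
sampling_rate = 0.1  # e.g. the max_sampling_rate flag
for global_step in (0, 100000, 300000, 500000):
    decay = max(0.05, 0.5 ** (global_step / 100000.0))
    print(global_step, round(sampling_rate * 0.05 / decay, 4))
# 0 -> 0.005, 100000 -> 0.01, 300000 -> 0.04,
# 500000 -> 0.1 (the 0.05 floor caps it at sampling_rate)
```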
neural_gpu/neural_gpu_trainer.py
View file @
a315e568
...
...
@@ -12,260 +12,744 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Neural GPU
for Learning Algorithms
."""
"""Neural GPU."""
import
math
import
os
import
random
import
sys
import
threading
import
time
import
matplotlib.animation
as
anim
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.platform
import
gfile
import
program_utils
import
data_utils
as
data
import
neural_gpu
import
neural_gpu
as
ngpu
import
wmt_utils
as
wmt
tf.app.flags.DEFINE_float("lr", 0.001, "Learning rate.")
tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
tf.app.flags.DEFINE_float("max_grad_norm", 1.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("lr", 0.1, "Learning rate.")
tf.app.flags.DEFINE_float("init_weight", 0.8, "Initial weights deviation.")
tf.app.flags.DEFINE_float("max_grad_norm", 4.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
tf.app.flags.DEFINE_float("pull", 0.0005, "Starting pull of the relaxations.")
tf.app.flags.DEFINE_float("pull_incr", 1.2, "Increase pull by that much.")
tf.app.flags.DEFINE_float("curriculum_bound", 0.15, "Move curriculum < this.")
tf.app.flags.DEFINE_float("dropout", 0.15, "Dropout that much.")
tf.app.flags.DEFINE_float("curriculum_ppx", 9.9, "Move curriculum if ppl < X.")
tf.app.flags.DEFINE_float("curriculum_seq", 0.3, "Move curriculum if seq < X.")
tf.app.flags.DEFINE_float("dropout", 0.0, "Dropout that much.")
tf.app.flags.DEFINE_float("grad_noise_scale", 0.0, "Gradient noise scale.")
tf.app.flags.DEFINE_float("max_sampling_rate", 0.1, "Maximal sampling rate.")
tf.app.flags.DEFINE_float("length_norm", 0.0, "Length normalization.")
tf.app.flags.DEFINE_float("train_beam_freq", 0.0, "Beam-based training.")
tf.app.flags.DEFINE_float("train_beam_anneal", 20000, "How many steps anneal.")
tf.app.flags.DEFINE_integer("eval_beam_steps", 4, "How many beam steps eval.")
tf.app.flags.DEFINE_integer("batch_size", 32, "Batch size.")
tf.app.flags.DEFINE_integer("low_batch_size", 16, "Low batch size.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, "Steps per epoch.")
tf.app.flags.DEFINE_integer("nmaps", 128, "Number of floats in each cell.")
tf.app.flags.DEFINE_integer("niclass", 33, "Number of classes (0 is padding).")
tf.app.flags.DEFINE_integer("noclass", 33, "Number of classes (0 is padding).")
tf.app.flags.DEFINE_integer("train_data_size", 5000, "Training examples/len.")
tf.app.flags.DEFINE_integer("max_length", 41, "Maximum length.")
tf.app.flags.DEFINE_integer("rx_step", 6, "Relax that many recursive steps.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 100, "Steps per epoch.")
tf.app.flags.DEFINE_integer("nmaps", 64, "Number of floats in each cell.")
tf.app.flags.DEFINE_integer("vec_size", 64, "Size of word vectors.")
tf.app.flags.DEFINE_integer("train_data_size", 1000, "Training examples/len.")
tf.app.flags.DEFINE_integer("max_length", 40, "Maximum length.")
tf.app.flags.DEFINE_integer("random_seed", 125459, "Random seed.")
tf.app.flags.DEFINE_integer("nconvs", 2, "How many convolutions / 1 step.")
tf.app.flags.DEFINE_integer("kw", 3, "Kernel width.")
tf.app.flags.DEFINE_integer("kh", 3, "Kernel height.")
tf.app.flags.DEFINE_integer("height", 4, "Height.")
tf.app.flags.DEFINE_integer("forward_max", 401, "Maximum forward length.")
tf.app.flags.DEFINE_integer("jobid", -1, "Task id when running on borg.")
tf.app.flags.DEFINE_integer("mem_size", -1, "Memory size (sqrt)")
tf.app.flags.DEFINE_integer("soft_mem_size", 1024, "Softmax memory this size.")
tf.app.flags.DEFINE_integer("num_gpus", 1, "Number of GPUs to use.")
tf.app.flags.DEFINE_integer("num_replicas", 1, "Number of replicas in use.")
tf.app.flags.DEFINE_integer("beam_size", 1, "Beam size during decoding. "
                            "If 0, no decoder, the non-extended Neural GPU.")
tf.app.flags.DEFINE_integer("max_target_vocab", 0,
                            "Maximal size of target vocabulary.")
tf.app.flags.DEFINE_integer("decode_offset", 0, "Offset for decoding.")
tf.app.flags.DEFINE_integer("task", -1, "Task id when running on borg.")
tf.app.flags.DEFINE_integer("nprint", 0, "How many test examples to print out.")
tf.app.flags.DEFINE_integer("eval_bin_print", 3, "How many bins step in eval.")
tf.app.flags.DEFINE_integer("mode", 0, "Mode: 0-train other-decode.")
tf.app.flags.DEFINE_bool("animate", False, "Whether to produce an animation.")
tf.app.flags.DEFINE_bool("atrous", False, "Whether to use atrous convs.")
tf.app.flags.DEFINE_bool("layer_norm", False, "Do layer normalization.")
tf.app.flags.DEFINE_bool("quantize", False, "Whether to quantize variables.")
tf.app.flags.DEFINE_string("task", "rev", "Which task are we learning?")
tf.app.flags.DEFINE_bool("do_train", True, "If false, only update memory.")
tf.app.flags.DEFINE_bool("rnn_baseline", False, "If true build an RNN instead.")
tf.app.flags.DEFINE_bool("simple_tokenizer", False,
                         "If true, tokenize on spaces only, digits are 0.")
tf.app.flags.DEFINE_bool("normalize_digits", True,
                         "Whether to normalize digits with simple tokenizer.")
tf.app.flags.DEFINE_integer("vocab_size", 16, "Joint vocabulary size.")
tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
tf.app.flags.DEFINE_string("train_dir", "/tmp/", "Directory to store models.")
tf.app.flags.DEFINE_string("ensemble", "", "Model paths for ensemble.")
tf.app.flags.DEFINE_string("test_file_prefix", "", "Files to test (.en,.fr).")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
                            "Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_string("word_vector_file_en", "",
                           "Optional file with word vectors to start training.")
tf.app.flags.DEFINE_string("word_vector_file_fr", "",
                           "Optional file with word vectors to start training.")
tf.app.flags.DEFINE_string("problem", "wmt", "What problem are we solving?.")
tf.app.flags.DEFINE_integer("ps_tasks", 0, "Number of ps tasks used.")
tf.app.flags.DEFINE_string("master", "", "Name of the TensorFlow master.")

FLAGS = tf.app.flags.FLAGS

EXTRA_EVAL = 12
EXTRA_EVAL = 10
EVAL_LEN_INCR = 8
MAXLEN_F = 2.0
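# A small grid search over lr, init_weight and max_grad_norm is done further
# below (in initialize) by perturbing them with the task id via job_id_factor.
# The helper below just mirrors that formula to show the cycling behaviour
# (illustrative only; job_id_factor_demo is not used by the trainer):
def job_id_factor_demo(task, step):
  """If task / step mod 3 is 0, 1, 2: return 0, 1, -1."""
  return ((((task / step) % 3) + 1) % 3) - 1

# For step=1 and task ids 0..5 this gives 0, 1, -1, 0, 1, -1, so each
# hyperparameter is multiplied by pow(base, 0), pow(base, 1) or pow(base, -1).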
def zero_split(tok_list, append=None):
  """Split tok_list (list of ints) on 0s, append int to all parts if given."""
  res, cur, l = [], [], 0
  for tok in tok_list:
    if tok == 0:
      if append is not None:
        cur.append(append)
      res.append(cur)
      l = max(l, len(cur))
      cur = []
    else:
      cur.append(tok)
  if append is not None:
    cur.append(append)
  res.append(cur)
  l = max(l, len(cur))
  return res, l
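# A quick worked example for zero_split (illustrative values):
#   parts, max_len = zero_split([3, 4, 0, 5, 6, 0, 7], append=1)
#   parts   == [[3, 4, 1], [5, 6, 1], [7, 1]]
#   max_len == 3
# The id list is cut at every 0, the appended symbol (e.g. an EOS id) is added
# to every part, and the length of the longest part is returned alongside.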
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print "  reading data line %d" % counter
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
global_train_set = {"wmt": []}
train_buckets_scale = {"wmt": []}


def calculate_buckets_scale(data_set, buckets, problem):
  """Calculate buckets scales for the given data set."""
  train_bucket_sizes = [len(data_set[b]) for b in xrange(len(buckets))]
  train_total_size = max(1, float(sum(train_bucket_sizes)))
  # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
  # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
  # the size of the i-th training bucket, as used later.
  if problem not in train_buckets_scale:
    train_buckets_scale[problem] = []
  train_buckets_scale[problem].append(
      [sum(train_bucket_sizes[:i + 1]) / train_total_size
       for i in xrange(len(train_bucket_sizes))])
  return train_total_size
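# The scale list built above is a cumulative distribution over buckets, so a
# bucket can be drawn by inverting it with a uniform sample (this is what
# get_bucket_id does below). Sketch with hypothetical bucket sizes:
#   sizes = [10, 30, 60]
#   scale = [sum(sizes[:i + 1]) / float(sum(sizes)) for i in xrange(len(sizes))]
#   # scale == [0.1, 0.4, 1.0]
#   r = np.random.random_sample()
#   bucket_id = min([i for i in xrange(len(scale)) if scale[i] > r])
# Bucket i is then picked with probability proportional to its size.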
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print "  Finished global data reading (%d)." % train_total_size
def
initialize
(
sess
):
def
initialize
(
sess
=
None
):
"""Initialize data and model."""
if
FLAGS
.
jobid
>=
0
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"log%d"
%
FLAGS
.
jobid
)
data
.
print_out
(
"NN "
,
newline
=
False
)
global
MAXLEN_F
# Create training directory if it does not exist.
if
not
tf
.
gfile
.
IsDirectory
(
FLAGS
.
train_dir
):
data
.
print_out
(
"Creating training directory %s."
%
FLAGS
.
train_dir
)
tf
.
gfile
.
MkDir
(
FLAGS
.
train_dir
)
decode_suffix
=
"beam%dln%d"
%
(
FLAGS
.
beam_size
,
int
(
100
*
FLAGS
.
length_norm
))
if
FLAGS
.
mode
==
0
:
decode_suffix
=
""
if
FLAGS
.
task
>=
0
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"log%d%s"
%
(
FLAGS
.
task
,
decode_suffix
))
else
:
data
.
log_filename
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu/log"
)
# Set random seed.
seed
=
FLAGS
.
random_seed
+
max
(
0
,
FLAGS
.
jobid
)
tf
.
set_random_seed
(
seed
)
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
if
FLAGS
.
random_seed
>
0
:
seed
=
FLAGS
.
random_seed
+
max
(
0
,
FLAGS
.
task
)
tf
.
set_random_seed
(
seed
)
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
# Check data sizes.
assert
data
.
bins
min_length
=
3
max_length
=
min
(
FLAGS
.
max_length
,
data
.
bins
[
-
1
])
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>=
max_length
+
EXTRA_EVAL
:
data
.
bins
=
data
.
bins
[:
-
1
]
if
sess
is
None
and
FLAGS
.
task
==
0
and
FLAGS
.
num_replicas
>
1
:
if
max_length
>
60
:
max_length
=
max_length
*
1
/
2
# Save memory on chief.
min_length
=
min
(
14
,
max_length
-
3
)
if
FLAGS
.
problem
==
"wmt"
else
3
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
if
p
in
[
"progeval"
,
"progsynth"
]:
min_length
=
max
(
26
,
min_length
)
assert
max_length
+
1
>
min_length
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>
max_length
+
EXTRA_EVAL
:
while
len
(
data
.
bins
)
>
1
and
data
.
bins
[
-
2
]
>
=
max_length
+
EXTRA_EVAL
:
data
.
bins
=
data
.
bins
[:
-
1
]
assert
data
.
bins
[
0
]
>
FLAGS
.
rx_step
data
.
forward_max
=
max
(
FLAGS
.
forward_max
,
data
.
bins
[
-
1
])
nclass
=
min
(
FLAGS
.
niclass
,
FLAGS
.
noclass
)
data_size
=
FLAGS
.
train_data_size
if
FLAGS
.
mode
==
0
else
1000
# Initialize data for each task.
tasks
=
FLAGS
.
task
.
split
(
"-"
)
for
t
in
tasks
:
for
l
in
xrange
(
max_length
+
EXTRA_EVAL
-
1
):
data
.
init_data
(
t
,
l
,
data_size
,
nclass
)
data
.
init_data
(
t
,
data
.
bins
[
-
2
],
data_size
,
nclass
)
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
data_size
,
nclass
)
end_size
=
4
*
1024
if
FLAGS
.
mode
>
0
else
1024
data
.
init_data
(
t
,
data
.
forward_max
,
end_size
,
nclass
)
# Print out parameters.
curriculum
=
FLAGS
.
curriculum_bound
msg1
=
(
"layers %d kw %d h %d kh %d relax %d batch %d noise %.2f task %s"
%
(
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
height
,
FLAGS
.
kh
,
FLAGS
.
rx_step
,
FLAGS
.
batch_size
,
FLAGS
.
grad_noise_scale
,
FLAGS
.
task
))
msg2
=
"data %d %s"
%
(
FLAGS
.
train_data_size
,
msg1
)
msg3
=
(
"cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
%
(
FLAGS
.
cutoff
,
FLAGS
.
pull_incr
,
FLAGS
.
lr
,
FLAGS
.
init_weight
,
curriculum
,
FLAGS
.
nmaps
,
FLAGS
.
dropout
,
FLAGS
.
max_grad_norm
,
msg2
))
data
.
print_out
(
msg3
)
# Create checkpoint directory if it does not exist.
checkpoint_dir
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu%s"
%
(
""
if
FLAGS
.
jobid
<
0
else
str
(
FLAGS
.
jobid
)))
if
not
gfile
.
IsDirectory
(
checkpoint_dir
):
if
FLAGS
.
mode
==
0
or
FLAGS
.
task
<
0
:
checkpoint_dir
=
os
.
path
.
join
(
FLAGS
.
train_dir
,
"neural_gpu%s"
%
(
""
if
FLAGS
.
task
<
0
else
str
(
FLAGS
.
task
)))
else
:
checkpoint_dir
=
FLAGS
.
train_dir
if
not
tf
.
gfile
.
IsDirectory
(
checkpoint_dir
):
data
.
print_out
(
"Creating checkpoint directory %s."
%
checkpoint_dir
)
gfile
.
MkDir
(
checkpoint_dir
)
tf
.
gfile
.
MkDir
(
checkpoint_dir
)
# Prepare data.
if
FLAGS
.
problem
==
"wmt"
:
# Prepare WMT data.
data
.
print_out
(
"Preparing WMT data in %s"
%
FLAGS
.
data_dir
)
if
FLAGS
.
simple_tokenizer
:
MAXLEN_F
=
3.5
(
en_train
,
fr_train
,
en_dev
,
fr_dev
,
en_path
,
fr_path
)
=
wmt
.
prepare_wmt_data
(
FLAGS
.
data_dir
,
FLAGS
.
vocab_size
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
else
:
(
en_train
,
fr_train
,
en_dev
,
fr_dev
,
en_path
,
fr_path
)
=
wmt
.
prepare_wmt_data
(
FLAGS
.
data_dir
,
FLAGS
.
vocab_size
)
# Read data into buckets and compute their sizes.
fr_vocab
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_path
)
data
.
vocab
=
fr_vocab
data
.
rev_vocab
=
rev_fr_vocab
data
.
print_out
(
"Reading development and training data (limit: %d)."
%
FLAGS
.
max_train_data_size
)
dev_set
=
read_data
(
en_dev
,
fr_dev
,
data
.
bins
)
def
data_read
(
size
,
print_out
):
read_data_into_global
(
en_train
,
fr_train
,
data
.
bins
,
size
,
print_out
)
data_read
(
50000
,
False
)
read_thread_small
=
threading
.
Thread
(
name
=
"reading-data-small"
,
target
=
lambda
:
data_read
(
900000
,
False
))
read_thread_small
.
start
()
read_thread_full
=
threading
.
Thread
(
name
=
"reading-data-full"
,
target
=
lambda
:
data_read
(
FLAGS
.
max_train_data_size
,
True
))
read_thread_full
.
start
()
data
.
print_out
(
"Data reading set up."
)
else
:
# Prepare algorithmic data.
en_path
,
fr_path
=
None
,
None
tasks
=
FLAGS
.
problem
.
split
(
"-"
)
data_size
=
FLAGS
.
train_data_size
for
t
in
tasks
:
data
.
print_out
(
"Generating data for %s."
%
t
)
if
t
in
[
"progeval"
,
"progsynth"
]:
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
20
*
data_size
,
FLAGS
.
vocab_size
)
if
len
(
program_utils
.
prog_vocab
)
>
FLAGS
.
vocab_size
-
2
:
raise
ValueError
(
"Increase vocab_size to %d for prog-tasks."
%
(
len
(
program_utils
.
prog_vocab
)
+
2
))
data
.
rev_vocab
=
program_utils
.
prog_vocab
data
.
vocab
=
program_utils
.
prog_rev_vocab
else
:
for
l
in
xrange
(
max_length
+
EXTRA_EVAL
-
1
):
data
.
init_data
(
t
,
l
,
data_size
,
FLAGS
.
vocab_size
)
data
.
init_data
(
t
,
data
.
bins
[
-
2
],
data_size
,
FLAGS
.
vocab_size
)
data
.
init_data
(
t
,
data
.
bins
[
-
1
],
data_size
,
FLAGS
.
vocab_size
)
if
t
not
in
global_train_set
:
global_train_set
[
t
]
=
[]
global_train_set
[
t
].
append
(
data
.
train_set
[
t
])
calculate_buckets_scale
(
data
.
train_set
[
t
],
data
.
bins
,
t
)
dev_set
=
data
.
test_set
# Grid-search parameters.
lr
=
FLAGS
.
lr
init_weight
=
FLAGS
.
init_weight
max_grad_norm
=
FLAGS
.
max_grad_norm
if
sess
is
not
None
and
FLAGS
.
task
>
-
1
:
def
job_id_factor
(
step
):
"""If jobid / step mod 3 is 0, 1, 2: say 0, 1, -1."""
return
((((
FLAGS
.
task
/
step
)
%
3
)
+
1
)
%
3
)
-
1
lr
*=
math
.
pow
(
2
,
job_id_factor
(
1
))
init_weight
*=
math
.
pow
(
1.5
,
job_id_factor
(
3
))
max_grad_norm
*=
math
.
pow
(
2
,
job_id_factor
(
9
))
# Print out parameters.
curriculum
=
FLAGS
.
curriculum_seq
msg1
=
(
"layers %d kw %d h %d kh %d batch %d noise %.2f"
%
(
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
height
,
FLAGS
.
kh
,
FLAGS
.
batch_size
,
FLAGS
.
grad_noise_scale
))
msg2
=
(
"cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
%
(
FLAGS
.
cutoff
,
lr
,
init_weight
,
curriculum
,
FLAGS
.
nmaps
,
FLAGS
.
dropout
,
max_grad_norm
,
msg1
))
data
.
print_out
(
msg2
)
# Create model and initialize it.
tf
.
get_variable_scope
().
set_initializer
(
tf
.
uniform_unit_scaling_initializer
(
factor
=
1.8
*
FLAGS
.
init_weight
))
model
=
neural_gpu
.
NeuralGPU
(
FLAGS
.
nmaps
,
FLAGS
.
nmaps
,
FLAGS
.
niclass
,
FLAGS
.
noclass
,
FLAGS
.
dropout
,
FLAGS
.
rx_step
,
FLAGS
.
max_grad_norm
,
FLAGS
.
cutoff
,
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
kh
,
FLAGS
.
height
,
FLAGS
.
mode
,
FLAGS
.
lr
,
FLAGS
.
pull
,
FLAGS
.
pull_incr
,
min_length
+
3
)
data
.
print_out
(
"Created model."
)
sess
.
run
(
tf
.
initialize_all_variables
())
data
.
print_out
(
"Initialized variables."
)
tf
.
orthogonal_initializer
(
gain
=
1.8
*
init_weight
))
max_sampling_rate
=
FLAGS
.
max_sampling_rate
if
FLAGS
.
mode
==
0
else
0.0
o
=
FLAGS
.
vocab_size
if
FLAGS
.
max_target_vocab
<
1
else
FLAGS
.
max_target_vocab
ngpu
.
CHOOSE_K
=
FLAGS
.
soft_mem_size
do_beam_model
=
FLAGS
.
train_beam_freq
>
0.0001
and
FLAGS
.
beam_size
>
1
beam_size
=
FLAGS
.
beam_size
if
FLAGS
.
mode
>
0
and
not
do_beam_model
else
1
beam_model
=
None
def
make_ngpu
(
cur_beam_size
,
back
):
return
ngpu
.
NeuralGPU
(
FLAGS
.
nmaps
,
FLAGS
.
vec_size
,
FLAGS
.
vocab_size
,
o
,
FLAGS
.
dropout
,
max_grad_norm
,
FLAGS
.
cutoff
,
FLAGS
.
nconvs
,
FLAGS
.
kw
,
FLAGS
.
kh
,
FLAGS
.
height
,
FLAGS
.
mem_size
,
lr
/
math
.
sqrt
(
FLAGS
.
num_replicas
),
min_length
+
3
,
FLAGS
.
num_gpus
,
FLAGS
.
num_replicas
,
FLAGS
.
grad_noise_scale
,
max_sampling_rate
,
atrous
=
FLAGS
.
atrous
,
do_rnn
=
FLAGS
.
rnn_baseline
,
do_layer_norm
=
FLAGS
.
layer_norm
,
beam_size
=
cur_beam_size
,
backward
=
back
)
if
sess
is
None
:
with
tf
.
device
(
tf
.
train
.
replica_device_setter
(
FLAGS
.
ps_tasks
)):
model
=
make_ngpu
(
beam_size
,
True
)
if
do_beam_model
:
tf
.
get_variable_scope
().
reuse_variables
()
beam_model
=
make_ngpu
(
FLAGS
.
beam_size
,
False
)
else
:
model
=
make_ngpu
(
beam_size
,
True
)
if
do_beam_model
:
tf
.
get_variable_scope
().
reuse_variables
()
beam_model
=
make_ngpu
(
FLAGS
.
beam_size
,
False
)
sv
=
None
if
sess
is
None
:
# The supervisor configuration has a few overriden options.
sv
=
tf
.
train
.
Supervisor
(
logdir
=
checkpoint_dir
,
is_chief
=
(
FLAGS
.
task
<
1
),
saver
=
model
.
saver
,
summary_op
=
None
,
save_summaries_secs
=
60
,
save_model_secs
=
15
*
60
,
global_step
=
model
.
global_step
)
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
)
sess
=
sv
.
PrepareSession
(
FLAGS
.
master
,
config
=
config
)
data
.
print_out
(
"Created model. Checkpoint dir %s"
%
checkpoint_dir
)
# Load model from parameters if a checkpoint exists.
ckpt
=
tf
.
train
.
get_checkpoint_state
(
checkpoint_dir
)
if
ckpt
and
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
):
if
ckpt
and
tf
.
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
+
".index"
):
data
.
print_out
(
"Reading model parameters from %s"
%
ckpt
.
model_checkpoint_path
)
model
.
saver
.
restore
(
sess
,
ckpt
.
model_checkpoint_path
)
# Check if there are ensemble models and get their checkpoints.
ensemble
=
[]
ensemble_dir_list
=
[
d
for
d
in
FLAGS
.
ensemble
.
split
(
","
)
if
d
]
for
ensemble_dir
in
ensemble_dir_list
:
ckpt
=
tf
.
train
.
get_checkpoint_state
(
ensemble_dir
)
if
ckpt
and
gfile
.
Exists
(
ckpt
.
model_checkpoint_path
):
data
.
print_out
(
"Found ensemble model %s"
%
ckpt
.
model_checkpoint_path
)
ensemble
.
append
(
ckpt
.
model_checkpoint_path
)
elif
sv
is
None
:
sess
.
run
(
tf
.
initialize_all_variables
())
data
.
print_out
(
"Initialized variables (no supervisor mode)."
)
elif
FLAGS
.
task
<
1
and
FLAGS
.
mem_size
>
0
:
# sess.run(model.mem_norm_op)
data
.
print_out
(
"Created new model and normalized mem (on chief)."
)
# Return the model and needed variables.
return
(
model
,
min_length
,
max_length
,
checkpoint_dir
,
curriculum
,
ensemble
)
def
single_test
(
l
,
model
,
sess
,
task
,
nprint
,
batch_size
,
print_out
=
True
,
offset
=
None
,
ensemble
=
None
,
get_steps
=
False
):
return
(
model
,
beam_model
,
min_length
,
max_length
,
checkpoint_dir
,
(
global_train_set
,
dev_set
,
en_path
,
fr_path
),
sv
,
sess
)
def m_step(model, beam_model, sess, batch_size, inp, target, bucket, nsteps, p):
  """Evaluation multi-step for program synthesis."""
  state, scores, hist = None, [[-11.0 for _ in xrange(batch_size)]], []
  for _ in xrange(nsteps):
    # Get the best beam (no training, just forward model).
    new_target, new_first, new_inp, new_scores = get_best_beam(
        beam_model, sess, inp, target, batch_size, FLAGS.beam_size,
        bucket, hist, p, test_mode=True)
    hist.append(new_first)
    _, _, _, state = model.step(sess, inp, new_target, False, state=state)
    inp = new_inp
    scores.append([max(scores[-1][i], new_scores[i])
                   for i in xrange(batch_size)])
  # The final step with the true target.
  loss, res, _, _ = model.step(sess, inp, target, False, state=state)
  return loss, res, new_target, scores[1:]
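# m_step keeps, for every example in the batch, the best beam score seen over
# the nsteps passes. The bookkeeping in the scores list, on toy numbers
# (illustrative):
#   scores = [[-11.0, -11.0]]                  # running best per example
#   for new_scores in [[2.0, -3.0], [1.0, 4.0]]:
#     scores.append([max(scores[-1][i], new_scores[i]) for i in xrange(2)])
#   # scores[1:] == [[2.0, -3.0], [2.0, 4.0]]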
def
single_test
(
bin_id
,
model
,
sess
,
nprint
,
batch_size
,
dev
,
p
,
print_out
=
True
,
offset
=
None
,
beam_model
=
None
):
"""Test model on test data of length l using the given session."""
inpt
,
target
=
data
.
get_batch
(
l
,
batch_size
,
False
,
task
,
offset
)
_
,
res
,
_
,
steps
=
model
.
step
(
sess
,
inpt
,
target
,
False
,
get_steps
=
get_steps
)
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
)
if
not
dev
[
p
][
bin_id
]:
data
.
print_out
(
" bin %d (%d)
\t
%s
\t
ppl NA errors NA seq-errors NA"
%
(
bin_id
,
data
.
bins
[
bin_id
],
p
))
return
1.0
,
1.0
,
0.0
inpt
,
target
=
data
.
get_batch
(
bin_id
,
batch_size
,
dev
[
p
],
FLAGS
.
height
,
offset
)
if
FLAGS
.
beam_size
>
1
and
beam_model
:
loss
,
res
,
new_tgt
,
scores
=
m_step
(
model
,
beam_model
,
sess
,
batch_size
,
inpt
,
target
,
bin_id
,
FLAGS
.
eval_beam_steps
,
p
)
score_avgs
=
[
sum
(
s
)
/
float
(
len
(
s
))
for
s
in
scores
]
score_maxs
=
[
max
(
s
)
for
s
in
scores
]
score_str
=
[
"(%.2f, %.2f)"
%
(
score_avgs
[
i
],
score_maxs
[
i
])
for
i
in
xrange
(
FLAGS
.
eval_beam_steps
)]
data
.
print_out
(
" == scores (avg, max): %s"
%
"; "
.
join
(
score_str
))
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
,
new_tgt
,
scores
[
-
1
])
else
:
loss
,
res
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
res
,
target
,
batch_size
,
nprint
)
seq_err
=
float
(
seq_err
)
/
batch_size
if
total
>
0
:
errors
=
float
(
errors
)
/
total
if
print_out
:
data
.
print_out
(
" %s len %d errors %.2f sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
# Ensemble eval.
if
ensemble
:
results
=
[]
for
m
in
ensemble
:
model
.
saver
.
restore
(
sess
,
m
)
_
,
result
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
m_errors
,
m_total
,
m_seq_err
=
data
.
accuracy
(
inpt
,
result
,
target
,
batch_size
,
nprint
)
m_seq_err
=
float
(
m_seq_err
)
/
batch_size
if
total
>
0
:
m_errors
=
float
(
m_errors
)
/
m_total
data
.
print_out
(
" %s len %d m-errors %.2f m-sequence-errors %.2f"
%
(
task
,
l
,
100
*
m_errors
,
100
*
m_seq_err
))
results
.
append
(
result
)
ens
=
[
sum
(
o
)
for
o
in
zip
(
*
results
)]
errors
,
total
,
seq_err
=
data
.
accuracy
(
inpt
,
ens
,
target
,
batch_size
,
nprint
)
seq_err
=
float
(
seq_err
)
/
batch_size
if
total
>
0
:
errors
=
float
(
errors
)
/
total
if
print_out
:
data
.
print_out
(
" %s len %d ens-errors %.2f ens-sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
return
errors
,
seq_err
,
(
steps
,
inpt
,
[
np
.
argmax
(
o
,
axis
=
1
)
for
o
in
res
])
def
multi_test
(
l
,
model
,
sess
,
task
,
nprint
,
batch_size
,
offset
=
None
,
ensemble
=
None
):
"""Run multiple tests at lower batch size to save memory."""
errors
,
seq_err
=
0.0
,
0.0
to_print
=
nprint
low_batch
=
FLAGS
.
low_batch_size
low_batch
=
min
(
low_batch
,
batch_size
)
for
mstep
in
xrange
(
batch_size
/
low_batch
):
cur_offset
=
None
if
offset
is
None
else
offset
+
mstep
*
low_batch
err
,
sq_err
,
_
=
single_test
(
l
,
model
,
sess
,
task
,
to_print
,
low_batch
,
False
,
cur_offset
,
ensemble
=
ensemble
)
to_print
=
max
(
0
,
to_print
-
low_batch
)
errors
+=
err
seq_err
+=
sq_err
if
FLAGS
.
mode
>
0
:
cur_errors
=
float
(
low_batch
*
errors
)
/
((
mstep
+
1
)
*
low_batch
)
cur_seq_err
=
float
(
low_batch
*
seq_err
)
/
((
mstep
+
1
)
*
low_batch
)
data
.
print_out
(
" %s multitest current errors %.2f sequence-errors %.2f"
%
(
task
,
100
*
cur_errors
,
100
*
cur_seq_err
))
errors
=
float
(
low_batch
)
*
float
(
errors
)
/
batch_size
seq_err
=
float
(
low_batch
)
*
float
(
seq_err
)
/
batch_size
data
.
print_out
(
" %s len %d errors %.2f sequence-errors %.2f"
%
(
task
,
l
,
100
*
errors
,
100
*
seq_err
))
return
errors
,
seq_err
data
.
print_out
(
" bin %d (%d)
\t
%s
\t
ppl %.2f errors %.2f seq-errors %.2f"
%
(
bin_id
,
data
.
bins
[
bin_id
],
p
,
data
.
safe_exp
(
loss
),
100
*
errors
,
100
*
seq_err
))
return
(
errors
,
seq_err
,
loss
)
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size, len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
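# Word-vector files are plain text: a word followed by its float components,
# e.g. "dog 0.045123 -0.61323 0.413667 ...". Parsing a single line as above
# (illustrative values):
#   line_parts = "dog 0.5 -0.25 1.0".split()
#   word = line_parts[0]                                # "dog"
#   word_vector = np.array(map(float, line_parts[1:]))  # array([ 0.5, -0.25, 1. ])
# The vector is used only when the word is in the vocabulary and its length
# matches FLAGS.vec_size.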
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
def get_bucket_id(train_buckets_scale_c, max_cur_length, data_set):
  """Get a random bucket id."""
  # Choose a bucket according to data distribution. Pick a random number
  # in [0, 1] and use the corresponding interval in train_buckets_scale.
  random_number_01 = np.random.random_sample()
  bucket_id = min([i for i in xrange(len(train_buckets_scale_c))
                   if train_buckets_scale_c[i] > random_number_01])
  while bucket_id > 0 and not data_set[bucket_id]:
    bucket_id -= 1
  for _ in xrange(10 if np.random.random_sample() < 0.9 else 1):
    if data.bins[bucket_id] > max_cur_length:
      random_number_01 = min(random_number_01, np.random.random_sample())
      bucket_id = min([i for i in xrange(len(train_buckets_scale_c))
                       if train_buckets_scale_c[i] > random_number_01])
      while bucket_id > 0 and not data_set[bucket_id]:
        bucket_id -= 1
  return bucket_id
def score_beams(beams, target, inp, history, p,
                print_out=False, test_mode=False):
  """Score beams."""
  if p == "progsynth":
    return score_beams_prog(beams, target, inp, history, print_out, test_mode)
  elif test_mode:
    return beams[0], 10.0 if str(beams[0][:len(target)]) == str(target) else 0.0
  else:
    history_s = [str(h) for h in history]
    best, best_score, tgt, eos_id = None, -1000.0, target, None
    if p == "wmt":
      eos_id = wmt.EOS_ID
    if eos_id and eos_id in target:
      tgt = target[:target.index(eos_id)]
    for beam in beams:
      if eos_id and eos_id in beam:
        beam = beam[:beam.index(eos_id)]
      l = min(len(tgt), len(beam))
      score = len([i for i in xrange(l) if tgt[i] == beam[i]]) / float(len(tgt))
      hist_score = 20.0 if str([b for b in beam if b > 0]) in history_s else 0.0
      if score < 1.0:
        score -= hist_score
      if score > best_score:
        best = beam
        best_score = score
    return best, best_score
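# Worked example of the non-progsynth scoring above (illustrative ids, with
# eos_id marking the end of both sequences):
#   target = [5, 6, 7, eos_id],  beam = [5, 6, 9, eos_id]
#   after cutting at EOS: tgt = [5, 6, 7], beam = [5, 6, 9]
#   2 of the 3 positions match, so score = 2 / 3.0 ~= 0.67; a beam that was
#   already proposed in an earlier step would additionally lose 20.0 while
#   score < 1.0, which strongly discourages repeats.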
def
score_beams_prog
(
beams
,
target
,
inp
,
history
,
print_out
=
False
,
test_mode
=
False
):
"""Score beams for program synthesis."""
tgt_prog
=
linearize
(
target
,
program_utils
.
prog_vocab
,
True
,
1
)
hist_progs
=
[
linearize
(
h
,
program_utils
.
prog_vocab
,
True
,
1
)
for
h
in
history
]
tgt_set
=
set
(
target
)
if
print_out
:
print
"target: "
,
tgt_prog
inps
,
tgt_outs
=
[],
[]
for
i
in
xrange
(
3
):
ilist
=
[
inp
[
i
+
1
,
l
]
for
l
in
xrange
(
inp
.
shape
[
1
])]
clist
=
[
program_utils
.
prog_vocab
[
x
]
for
x
in
ilist
if
x
>
0
]
olist
=
clist
[
clist
.
index
(
"]"
)
+
1
:]
# outputs
clist
=
clist
[
1
:
clist
.
index
(
"]"
)]
# inputs
inps
.
append
([
int
(
x
)
for
x
in
clist
])
if
olist
[
0
]
==
"["
:
# olist may be [int] or just int
tgt_outs
.
append
(
str
([
int
(
x
)
for
x
in
olist
[
1
:
-
1
]]))
else
:
if
len
(
olist
)
==
1
:
tgt_outs
.
append
(
olist
[
0
])
else
:
print
[
program_utils
.
prog_vocab
[
x
]
for
x
in
ilist
if
x
>
0
]
print
olist
print
tgt_prog
print
program_utils
.
evaluate
(
tgt_prog
,
{
"a"
:
inps
[
-
1
]})
print
"AAAAA"
tgt_outs
.
append
(
olist
[
0
])
if
not
test_mode
:
for
_
in
xrange
(
7
):
ilen
=
np
.
random
.
randint
(
len
(
target
)
-
3
)
+
1
inps
.
append
([
random
.
choice
(
range
(
-
15
,
15
))
for
_
in
range
(
ilen
)])
tgt_outs
.
extend
([
program_utils
.
evaluate
(
tgt_prog
,
{
"a"
:
inp
})
for
inp
in
inps
[
3
:]])
best
,
best_prog
,
best_score
=
None
,
""
,
-
1000.0
for
beam
in
beams
:
b_prog
=
linearize
(
beam
,
program_utils
.
prog_vocab
,
True
,
1
)
b_set
=
set
(
beam
)
jsim
=
len
(
tgt_set
&
b_set
)
/
float
(
len
(
tgt_set
|
b_set
))
b_outs
=
[
program_utils
.
evaluate
(
b_prog
,
{
"a"
:
inp
})
for
inp
in
inps
]
errs
=
len
([
x
for
x
in
b_outs
if
x
==
"ERROR"
])
imatches
=
len
([
i
for
i
in
xrange
(
3
)
if
b_outs
[
i
]
==
tgt_outs
[
i
]])
perfect
=
10.0
if
imatches
==
3
else
0.0
hist_score
=
20.0
if
b_prog
in
hist_progs
else
0.0
if
test_mode
:
score
=
perfect
-
errs
else
:
matches
=
len
([
i
for
i
in
xrange
(
10
)
if
b_outs
[
i
]
==
tgt_outs
[
i
]])
score
=
perfect
+
matches
+
jsim
-
errs
if
score
<
10.0
:
score
-=
hist_score
# print b_prog
# print "jsim: ", jsim, " errs: ", errs, " mtchs: ", matches, " s: ", score
if
score
>
best_score
:
best
=
beam
best_prog
=
b_prog
best_score
=
score
if
print_out
:
print
"best score: "
,
best_score
,
" best prog: "
,
best_prog
return
best
,
best_score
def
get_best_beam
(
beam_model
,
sess
,
inp
,
target
,
batch_size
,
beam_size
,
bucket
,
history
,
p
,
test_mode
=
False
):
"""Run beam_model, score beams, and return the best as target and in input."""
_
,
output_logits
,
_
,
_
=
beam_model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
)
new_targets
,
new_firsts
,
scores
,
new_inp
=
[],
[],
[],
np
.
copy
(
inp
)
for
b
in
xrange
(
batch_size
):
outputs
=
[]
history_b
=
[[
h
[
b
,
0
,
l
]
for
l
in
xrange
(
data
.
bins
[
bucket
])]
for
h
in
history
]
for
beam_idx
in
xrange
(
beam_size
):
outputs
.
append
([
int
(
o
[
beam_idx
*
batch_size
+
b
])
for
o
in
output_logits
])
target_t
=
[
target
[
b
,
0
,
l
]
for
l
in
xrange
(
data
.
bins
[
bucket
])]
best
,
best_score
=
score_beams
(
outputs
,
[
t
for
t
in
target_t
if
t
>
0
],
inp
[
b
,
:,
:],
[[
t
for
t
in
h
if
t
>
0
]
for
h
in
history_b
],
p
,
test_mode
=
test_mode
)
scores
.
append
(
best_score
)
if
1
in
best
:
# Only until _EOS.
best
=
best
[:
best
.
index
(
1
)
+
1
]
best
+=
[
0
for
_
in
xrange
(
len
(
target_t
)
-
len
(
best
))]
new_targets
.
append
([
best
])
first
,
_
=
score_beams
(
outputs
,
[
t
for
t
in
target_t
if
t
>
0
],
inp
[
b
,
:,
:],
[[
t
for
t
in
h
if
t
>
0
]
for
h
in
history_b
],
p
,
test_mode
=
True
)
if
1
in
first
:
# Only until _EOS.
first
=
first
[:
first
.
index
(
1
)
+
1
]
first
+=
[
0
for
_
in
xrange
(
len
(
target_t
)
-
len
(
first
))]
new_inp
[
b
,
0
,
:]
=
np
.
array
(
first
,
dtype
=
np
.
int32
)
new_firsts
.
append
([
first
])
# Change target if we found a great answer.
new_target
=
np
.
array
(
new_targets
,
dtype
=
np
.
int32
)
for
b
in
xrange
(
batch_size
):
if
scores
[
b
]
>=
10.0
:
target
[
b
,
0
,
:]
=
new_target
[
b
,
0
,
:]
new_first
=
np
.
array
(
new_firsts
,
dtype
=
np
.
int32
)
return
new_target
,
new_first
,
new_inp
,
scores
def
train
():
"""Train the model."""
batch_size
=
FLAGS
.
batch_size
tasks
=
FLAGS
.
task
.
split
(
"-"
)
with
tf
.
Session
()
as
sess
:
(
model
,
min_length
,
max_length
,
checkpoint_dir
,
curriculum
,
_
)
=
initialize
(
sess
)
quant_op
=
neural_gpu
.
quantize_weights_op
(
512
,
8
)
batch_size
=
FLAGS
.
batch_size
*
FLAGS
.
num_gpus
(
model
,
beam_model
,
min_length
,
max_length
,
checkpoint_dir
,
(
train_set
,
dev_set
,
en_vocab_path
,
fr_vocab_path
),
sv
,
sess
)
=
initialize
()
with
sess
.
as_default
():
quant_op
=
model
.
quantize_op
max_cur_length
=
min
(
min_length
+
3
,
max_length
)
prev_acc_perp
=
[
1000000
for
_
in
xrange
(
3
)]
prev_acc_perp
=
[
1000000
for
_
in
xrange
(
5
)]
prev_seq_err
=
1.0
is_chief
=
FLAGS
.
task
<
1
do_report
=
False
# Main traning loop.
while
True
:
global_step
,
pull
,
max_cur_length
,
learning_rate
=
sess
.
run
(
[
model
.
global_step
,
model
.
pull
,
model
.
cur_length
,
model
.
lr
])
acc_loss
,
acc_total
,
acc_errors
,
acc_seq_err
=
0.0
,
0
,
0
,
0
acc_grad_norm
,
step_count
,
step_time
=
0.0
,
0
,
0.0
while
not
sv
.
ShouldStop
():
global_step
,
max_cur_length
,
learning_rate
=
sess
.
run
(
[
model
.
global_step
,
model
.
cur_length
,
model
.
lr
])
acc_loss
,
acc_l1
,
acc_total
,
acc_errors
,
acc_seq_err
=
0.0
,
0.0
,
0
,
0
,
0
acc_grad_norm
,
step_count
,
step_c1
,
step_time
=
0.0
,
0
,
0
,
0.0
# For words in the word vector file, set their embedding at start.
bound1
=
FLAGS
.
steps_per_checkpoint
-
1
if
FLAGS
.
word_vector_file_en
and
global_step
<
bound1
and
is_chief
:
assign_vectors
(
FLAGS
.
word_vector_file_en
,
"embedding:0"
,
en_vocab_path
,
sess
)
if
FLAGS
.
max_target_vocab
<
1
:
assign_vectors
(
FLAGS
.
word_vector_file_en
,
"target_embedding:0"
,
en_vocab_path
,
sess
)
if
FLAGS
.
word_vector_file_fr
and
global_step
<
bound1
and
is_chief
:
assign_vectors
(
FLAGS
.
word_vector_file_fr
,
"embedding:0"
,
fr_vocab_path
,
sess
)
if
FLAGS
.
max_target_vocab
<
1
:
assign_vectors
(
FLAGS
.
word_vector_file_fr
,
"target_embedding:0"
,
fr_vocab_path
,
sess
)
for
_
in
xrange
(
FLAGS
.
steps_per_checkpoint
):
global_step
+=
1
task
=
random
.
choice
(
tasks
)
# Select the length for curriculum learning.
l
=
np
.
random
.
randint
(
max_cur_length
-
min_length
+
1
)
+
min_length
# Prefer longer stuff 60% of time.
if
np
.
random
.
randint
(
100
)
<
60
:
l1
=
np
.
random
.
randint
(
max_cur_length
-
min_length
+
1
)
+
min_length
l
=
max
(
l
,
l1
)
# Mixed curriculum learning: in 25% of cases go to any larger length.
if
np
.
random
.
randint
(
100
)
<
25
:
l1
=
np
.
random
.
randint
(
max_length
-
min_length
+
1
)
+
min_length
l
=
max
(
l
,
l1
)
step_count
+=
1
step_c1
+=
1
global_step
=
int
(
model
.
global_step
.
eval
())
train_beam_anneal
=
global_step
/
float
(
FLAGS
.
train_beam_anneal
)
train_beam_freq
=
FLAGS
.
train_beam_freq
*
min
(
1.0
,
train_beam_anneal
)
p
=
random
.
choice
(
FLAGS
.
problem
.
split
(
"-"
))
train_set
=
global_train_set
[
p
][
-
1
]
bucket_id
=
get_bucket_id
(
train_buckets_scale
[
p
][
-
1
],
max_cur_length
,
train_set
)
# Prefer longer stuff 60% of time if not wmt.
if
np
.
random
.
randint
(
100
)
<
60
and
FLAGS
.
problem
!=
"wmt"
:
bucket1
=
get_bucket_id
(
train_buckets_scale
[
p
][
-
1
],
max_cur_length
,
train_set
)
bucket_id
=
max
(
bucket1
,
bucket_id
)
# Run a step and time it.
start_time
=
time
.
time
()
inp
,
target
=
data
.
get_batch
(
l
,
batch_size
,
True
,
task
)
noise_param
=
math
.
sqrt
(
math
.
pow
(
global_step
,
-
0.55
)
*
inp
,
target
=
data
.
get_batch
(
bucket_id
,
batch_size
,
train_set
,
FLAGS
.
height
)
noise_param
=
math
.
sqrt
(
math
.
pow
(
global_step
+
1
,
-
0.55
)
*
prev_seq_err
)
*
FLAGS
.
grad_noise_scale
loss
,
res
,
gnorm
,
_
=
model
.
step
(
sess
,
inp
,
target
,
True
,
noise_param
)
# In multi-step mode, we use best from beam for middle steps.
state
,
new_target
,
scores
,
history
=
None
,
None
,
None
,
[]
while
(
FLAGS
.
beam_size
>
1
and
train_beam_freq
>
np
.
random
.
random_sample
()):
# Get the best beam (no training, just forward model).
new_target
,
new_first
,
new_inp
,
scores
=
get_best_beam
(
beam_model
,
sess
,
inp
,
target
,
batch_size
,
FLAGS
.
beam_size
,
bucket_id
,
history
,
p
)
history
.
append
(
new_first
)
# Training step with the previous input and the best beam as target.
_
,
_
,
_
,
state
=
model
.
step
(
sess
,
inp
,
new_target
,
FLAGS
.
do_train
,
noise_param
,
update_mem
=
True
,
state
=
state
)
# Change input to the new one for the next step.
inp
=
new_inp
# If all results are great, stop (todo: not to wait for all?).
if
FLAGS
.
nprint
>
1
:
print
scores
if
sum
(
scores
)
/
float
(
len
(
scores
))
>=
10.0
:
break
# The final step with the true target.
loss
,
res
,
gnorm
,
_
=
model
.
step
(
sess
,
inp
,
target
,
FLAGS
.
do_train
,
noise_param
,
update_mem
=
True
,
state
=
state
)
step_time
+=
time
.
time
()
-
start_time
acc_grad_norm
+=
float
(
gnorm
)
# Accumulate statistics only if we did not exceed curriculum length.
if
l
<
max_cur_length
+
1
:
step_count
+=
1
acc_loss
+=
loss
errors
,
total
,
seq_err
=
data
.
accuracy
(
inp
,
res
,
target
,
batch_size
,
0
)
acc_total
+=
total
acc_errors
+=
errors
acc_seq_err
+=
seq_err
acc_grad_norm
+=
0.0
if
gnorm
is
None
else
float
(
gnorm
)
# Accumulate statistics.
acc_loss
+=
loss
acc_l1
+=
loss
errors
,
total
,
seq_err
=
data
.
accuracy
(
inp
,
res
,
target
,
batch_size
,
0
,
new_target
,
scores
)
if
FLAGS
.
nprint
>
1
:
print
"seq_err: "
,
seq_err
acc_total
+=
total
acc_errors
+=
errors
acc_seq_err
+=
seq_err
# Report summary every 10 steps.
if
step_count
+
3
>
FLAGS
.
steps_per_checkpoint
:
do_report
=
True
# Don't polute plot too early.
if
is_chief
and
step_count
%
10
==
1
and
do_report
:
cur_loss
=
acc_l1
/
float
(
step_c1
)
acc_l1
,
step_c1
=
0.0
,
0
cur_perp
=
data
.
safe_exp
(
cur_loss
)
summary
=
tf
.
Summary
()
summary
.
value
.
extend
(
[
tf
.
Summary
.
Value
(
tag
=
"log_perplexity"
,
simple_value
=
cur_loss
),
tf
.
Summary
.
Value
(
tag
=
"perplexity"
,
simple_value
=
cur_perp
)])
sv
.
SummaryComputed
(
sess
,
summary
,
global_step
)
# Normalize and print out accumulated statistics.
acc_loss
/=
step_count
...
...
@@ -273,178 +757,257 @@ def train():
acc_seq_err
=
float
(
acc_seq_err
)
/
(
step_count
*
batch_size
)
prev_seq_err
=
max
(
0.0
,
acc_seq_err
-
0.02
)
# No noise at error < 2%.
acc_errors
=
float
(
acc_errors
)
/
acc_total
if
acc_total
>
0
else
1.0
msg1
=
"step %d step-time %.2f"
%
(
global_step
,
step_time
)
msg
2
=
"lr %.8f pull %.3f"
%
(
learning_rate
,
pull
)
msg3
=
(
"%s %s grad-norm %.8f"
%
(
msg1
,
msg2
,
acc_grad_norm
/
FLAGS
.
steps_per_checkpoint
))
data
.
print_out
(
"%s len %d pp
x
%.
8
f errors %.2f sequence-errors %.2f"
%
(
msg
3
,
max_cur_length
,
data
.
safe_exp
(
acc_loss
),
t_size
=
float
(
sum
([
len
(
x
)
for
x
in
train_set
]))
/
float
(
1000000
)
msg
=
(
"step %d step-time %.2f train-size %.3f lr %.6f grad-norm %.4f"
%
(
global_step
+
1
,
step_time
,
t_size
,
learning_rate
,
acc_grad_norm
/
FLAGS
.
steps_per_checkpoint
))
data
.
print_out
(
"%s len %d pp
l
%.
6
f errors %.2f sequence-errors %.2f"
%
(
msg
,
max_cur_length
,
data
.
safe_exp
(
acc_loss
),
100
*
acc_errors
,
100
*
acc_seq_err
))
# If errors are below the curriculum threshold, move curriculum forward.
if
curriculum
>
acc_seq_err
:
is_good
=
FLAGS
.
curriculum_ppx
>
data
.
safe_exp
(
acc_loss
)
is_good
=
is_good
and
FLAGS
.
curriculum_seq
>
acc_seq_err
if
is_good
and
is_chief
:
if
FLAGS
.
quantize
:
# Quantize weights.
data
.
print_out
(
" Quantizing parameters."
)
sess
.
run
([
quant_op
])
# Increase current length (until the next with training data).
do_incr
=
True
while
do_incr
and
max_cur_length
<
max_length
:
sess
.
run
(
model
.
cur_length_incr_op
)
for
t
in
tasks
:
if
data
.
train_set
[
t
]:
do_incr
=
False
sess
.
run
(
model
.
cur_length_incr_op
)
# Forget last perplexities if we're not yet at the end.
if
max_cur_length
<
max_length
:
prev_acc_perp
.
append
(
1000000
)
# Either increase pull or, if it's large, average parameters.
if
pull
<
0.1
:
sess
.
run
(
model
.
pull_incr_op
)
else
:
data
.
print_out
(
" Averaging parameters."
)
sess
.
run
(
model
.
avg_op
)
if
acc_seq_err
<
(
curriculum
/
3.0
):
sess
.
run
(
model
.
lr_decay_op
)
# Lower learning rate if we're worse than the last
3
checkpoints.
# Lower learning rate if we're worse than the last
5
checkpoints.
acc_perp
=
data
.
safe_exp
(
acc_loss
)
if
acc_perp
>
max
(
prev_acc_perp
[
-
3
:]):
if
acc_perp
>
max
(
prev_acc_perp
[
-
5
:])
and
is_chief
:
sess
.
run
(
model
.
lr_decay_op
)
prev_acc_perp
.
append
(
acc_perp
)
# Save checkpoint.
checkpoint_path
=
os
.
path
.
join
(
checkpoint_dir
,
"neural_gpu.ckpt"
)
model
.
saver
.
save
(
sess
,
checkpoint_path
,
global_step
=
model
.
global_step
)
# Run evaluation.
bound
=
data
.
bins
[
-
1
]
+
1
for
t
in
tasks
:
l
=
min_length
while
l
<
max_length
+
EXTRA_EVAL
and
l
<
bound
:
_
,
seq_err
,
_
=
single_test
(
l
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
)
l
+=
1
while
l
<
bound
+
1
and
not
data
.
test_set
[
t
][
l
]:
l
+=
1
if
seq_err
<
0.05
:
# Run larger test if we're good enough.
_
,
seq_err
=
multi_test
(
data
.
forward_max
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
*
4
)
if
seq_err
<
0.01
:
# Super-large test on 1-task large-forward models.
if
data
.
forward_max
>
4000
and
len
(
tasks
)
==
1
:
multi_test
(
data
.
forward_max
,
model
,
sess
,
tasks
[
0
],
FLAGS
.
nprint
,
batch_size
*
16
,
0
)
def
animate
(
l
,
test_data
,
anim_size
):
"""Create animation for the given data (hacky matplotlib use)."""
xf
=
12
# Extra frames to slow down at start and end.
fps
=
2
# Frames per step.
# Make the figure.
fig
=
plt
.
figure
(
figsize
=
(
16
,
9
),
facecolor
=
"white"
)
ax
=
fig
.
add_axes
([
0
,
0
,
1
,
1
],
frameon
=
False
,
zorder
=
2
)
ax
.
set_xticks
([
i
*
24
-
0.5
for
i
in
xrange
(
4
)])
ax
.
set_xticklabels
([])
ax
.
set_yticks
([
i
-
0.5
for
i
in
xrange
(
l
+
1
)])
ax
.
grid
(
which
=
"major"
,
axis
=
"both"
,
linestyle
=
"-"
,
color
=
"black"
)
# We need text fields.
text_fields
=
[]
text_size
=
24
*
32
/
l
for
y
in
xrange
(
l
):
text_fields
.
append
(
ax
.
text
(
11.25
,
y
+
0.15
,
""
,
color
=
"g"
,
ha
=
"center"
,
va
=
"center"
,
bbox
=
{
"facecolor"
:
"b"
,
"alpha"
:
0.01
,
"pad"
:
24
*
text_size
},
size
=
text_size
-
(
4
*
32
/
l
),
animated
=
True
))
im
=
ax
.
imshow
(
np
.
zeros_like
(
test_data
[
0
][
0
][
0
]),
vmin
=-
1.0
,
vmax
=
1.0
,
cmap
=
"gray"
,
aspect
=
"auto"
,
origin
=
"upper"
,
interpolation
=
"none"
,
animated
=
True
)
im
.
set_zorder
(
1
)
# Main animation step.
def
animation_update
(
frame_no
,
test_data
,
xf
,
im
,
text_fields
):
"""Update an animation frame."""
steps
,
inpt
,
out_raw
=
test_data
length
=
len
(
steps
)
batch
=
frame_no
/
(
fps
*
(
l
+
4
*
xf
))
index
=
int
((
frame_no
%
(
fps
*
(
l
+
4
*
xf
)))
/
fps
)
# Cut output after first padding.
out
=
[
out_raw
[
i
][
batch
]
for
i
in
xrange
(
len
(
text_fields
))]
if
0
in
out
:
i
=
out
.
index
(
0
)
out
=
out
[
0
:
i
]
+
[
0
for
_
in
xrange
(
len
(
out
)
-
i
)]
# Show the state after the first frames.
if
index
>=
2
*
xf
:
im
.
set_array
(
steps
[
min
(
length
-
1
,
index
-
2
*
xf
)][
batch
])
for
i
,
t
in
enumerate
(
text_fields
):
if
index
-
2
*
xf
<
length
:
t
.
set_text
(
""
)
else
:
t
.
set_text
(
data
.
to_symbol
(
out
[
i
]))
else
:
for
i
,
t
in
enumerate
(
text_fields
):
t
.
set_text
(
data
.
to_symbol
(
inpt
[
i
][
batch
])
if
index
<
xf
else
""
)
if
index
<
xf
:
im
.
set_array
(
np
.
zeros_like
(
steps
[
0
][
0
]))
else
:
im
.
set_array
(
steps
[
0
][
batch
])
return
im
,
# Create the animation and save to mp4.
animation
=
anim
.
FuncAnimation
(
fig
,
animation_update
,
blit
=
True
,
frames
=
(
l
+
4
*
xf
)
*
anim_size
*
fps
,
interval
=
500
/
fps
,
fargs
=
(
test_data
,
xf
,
im
,
text_fields
))
animation
.
save
(
"/tmp/neural_gpu.mp4"
,
writer
=
"mencoder"
,
fps
=
4
*
fps
,
dpi
=
3
*
80
)
if
is_chief
:
checkpoint_path
=
os
.
path
.
join
(
checkpoint_dir
,
"neural_gpu.ckpt"
)
model
.
saver
.
save
(
sess
,
checkpoint_path
,
global_step
=
model
.
global_step
)
# Run evaluation.
bin_bound
=
4
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
total_loss
,
total_err
,
tl_counter
=
0.0
,
0.0
,
0
for
bin_id
in
xrange
(
len
(
data
.
bins
)):
if
bin_id
<
bin_bound
or
bin_id
%
FLAGS
.
eval_bin_print
==
1
:
err
,
_
,
loss
=
single_test
(
bin_id
,
model
,
sess
,
FLAGS
.
nprint
,
batch_size
*
4
,
dev_set
,
p
,
beam_model
=
beam_model
)
if
loss
>
0.0
:
total_loss
+=
loss
total_err
+=
err
tl_counter
+=
1
test_loss
=
total_loss
/
max
(
1
,
tl_counter
)
test_err
=
total_err
/
max
(
1
,
tl_counter
)
test_perp
=
data
.
safe_exp
(
test_loss
)
summary
=
tf
.
Summary
()
summary
.
value
.
extend
(
[
tf
.
Summary
.
Value
(
tag
=
"test/%s/loss"
%
p
,
simple_value
=
test_loss
),
tf
.
Summary
.
Value
(
tag
=
"test/%s/error"
%
p
,
simple_value
=
test_err
),
tf
.
Summary
.
Value
(
tag
=
"test/%s/perplexity"
%
p
,
simple_value
=
test_perp
)])
sv
.
SummaryComputed
(
sess
,
summary
,
global_step
)
def linearize(output, rev_fr_vocab, simple_tokenizer=None, eos_id=wmt.EOS_ID):
  # If there is an EOS symbol in outputs, cut them at that point (WMT).
  if eos_id in output:
    output = output[:output.index(eos_id)]
  # Print out French sentence corresponding to outputs.
  if simple_tokenizer or FLAGS.simple_tokenizer:
    vlen = len(rev_fr_vocab)
    def vget(o):
      if o < vlen:
        return rev_fr_vocab[o]
      return "UNK"
    return " ".join([vget(o) for o in output])
  else:
    return wmt.basic_detokenizer([rev_fr_vocab[o] for o in output])
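# Example use of linearize with a toy reversed vocabulary (illustrative; real
# vocabularies come from wmt.initialize_vocabulary):
#   rev_vocab = ["_PAD", "_GO", "_EOS", "le", "chat"]
#   linearize([3, 4, 2, 0, 0], rev_vocab, simple_tokenizer=True, eos_id=2)
#   # -> "le chat"  (ids after the EOS id are dropped, tokens joined by spaces)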
def
evaluate
():
"""Evaluate an existing model."""
batch_size
=
FLAGS
.
batch_size
tasks
=
FLAGS
.
task
.
split
(
"-"
)
with
tf
.
Session
()
as
sess
:
model
,
min_length
,
max_length
,
_
,
_
,
ensemble
=
initialize
(
sess
)
bound
=
data
.
bins
[
-
1
]
+
1
for
t
in
tasks
:
l
=
min_length
while
l
<
max_length
+
EXTRA_EVAL
and
l
<
bound
:
_
,
seq_err
,
_
=
single_test
(
l
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
,
ensemble
=
ensemble
)
l
+=
1
while
l
<
bound
+
1
and
not
data
.
test_set
[
t
][
l
]:
l
+=
1
# Animate.
if
FLAGS
.
animate
:
anim_size
=
2
_
,
_
,
test_data
=
single_test
(
l
,
model
,
sess
,
t
,
0
,
anim_size
,
get_steps
=
True
)
animate
(
l
,
test_data
,
anim_size
)
# More tests.
_
,
seq_err
=
multi_test
(
data
.
forward_max
,
model
,
sess
,
t
,
FLAGS
.
nprint
,
batch_size
*
4
,
ensemble
=
ensemble
)
if
seq_err
<
0.01
:
# Super-test if we're very good and in large-test mode.
if
data
.
forward_max
>
4000
and
len
(
tasks
)
==
1
:
multi_test
(
data
.
forward_max
,
model
,
sess
,
tasks
[
0
],
FLAGS
.
nprint
,
batch_size
*
64
,
0
,
ensemble
=
ensemble
)
batch_size
=
FLAGS
.
batch_size
*
FLAGS
.
num_gpus
with
tf
.
Session
(
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
))
as
sess
:
(
model
,
beam_model
,
_
,
_
,
_
,
(
_
,
dev_set
,
en_vocab_path
,
fr_vocab_path
),
_
,
sess
)
=
initialize
(
sess
)
for
p
in
FLAGS
.
problem
.
split
(
"-"
):
for
bin_id
in
xrange
(
len
(
data
.
bins
)):
if
(
FLAGS
.
task
>=
0
and
bin_id
>
4
)
or
(
FLAGS
.
nprint
==
0
and
bin_id
>
8
and
p
==
"wmt"
):
break
single_test
(
bin_id
,
model
,
sess
,
FLAGS
.
nprint
,
batch_size
,
dev_set
,
p
,
beam_model
=
beam_model
)
path
=
FLAGS
.
test_file_prefix
xid
=
""
if
FLAGS
.
task
<
0
else
(
"%.4d"
%
(
FLAGS
.
task
+
FLAGS
.
decode_offset
))
en_path
,
fr_path
=
path
+
".en"
+
xid
,
path
+
".fr"
+
xid
# Evaluate the test file if they exist.
if
path
and
tf
.
gfile
.
Exists
(
en_path
)
and
tf
.
gfile
.
Exists
(
fr_path
):
data
.
print_out
(
"Translating test set %s"
%
en_path
)
# Read lines.
en_lines
,
fr_lines
=
[],
[]
with
tf
.
gfile
.
GFile
(
en_path
,
mode
=
"r"
)
as
f
:
for
line
in
f
:
en_lines
.
append
(
line
.
strip
())
with
tf
.
gfile
.
GFile
(
fr_path
,
mode
=
"r"
)
as
f
:
for
line
in
f
:
fr_lines
.
append
(
line
.
strip
())
# Tokenize and convert to ids.
en_vocab
,
_
=
wmt
.
initialize_vocabulary
(
en_vocab_path
)
_
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_vocab_path
)
if
FLAGS
.
simple_tokenizer
:
en_ids
=
[
wmt
.
sentence_to_token_ids
(
l
,
en_vocab
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
for
l
in
en_lines
]
else
:
en_ids
=
[
wmt
.
sentence_to_token_ids
(
l
,
en_vocab
)
for
l
in
en_lines
]
# Translate.
results
=
[]
for
idx
,
token_ids
in
enumerate
(
en_ids
):
if
idx
%
5
==
0
:
data
.
print_out
(
"Translating example %d of %d."
%
(
idx
,
len
(
en_ids
)))
# Which bucket does it belong to?
buckets
=
[
b
for
b
in
xrange
(
len
(
data
.
bins
))
if
data
.
bins
[
b
]
>=
len
(
token_ids
)]
if
buckets
:
result
,
result_cost
=
[],
100000000.0
for
bucket_id
in
buckets
:
if
data
.
bins
[
bucket_id
]
>
MAXLEN_F
*
len
(
token_ids
)
+
EVAL_LEN_INCR
:
break
# Get a 1-element batch to feed the sentence to the model.
used_batch_size
=
1
# batch_size
inp
,
target
=
data
.
get_batch
(
bucket_id
,
used_batch_size
,
None
,
FLAGS
.
height
,
preset
=
([
token_ids
],
[[]]))
loss
,
output_logits
,
_
,
_
=
model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
)
outputs
=
[
int
(
o
[
0
])
for
o
in
output_logits
]
loss
=
loss
[
0
]
-
(
data
.
bins
[
bucket_id
]
*
FLAGS
.
length_norm
)
if
FLAGS
.
simple_tokenizer
:
cur_out
=
outputs
if
wmt
.
EOS_ID
in
cur_out
:
cur_out
=
cur_out
[:
cur_out
.
index
(
wmt
.
EOS_ID
)]
res_tags
=
[
rev_fr_vocab
[
o
]
for
o
in
cur_out
]
bad_words
,
bad_brack
=
wmt
.
parse_constraints
(
token_ids
,
res_tags
)
loss
+=
1000.0
*
bad_words
+
100.0
*
bad_brack
# print (bucket_id, loss)
if
loss
<
result_cost
:
result
=
outputs
result_cost
=
loss
final
=
linearize
(
result
,
rev_fr_vocab
)
results
.
append
(
"%s
\t
%s
\n
"
%
(
final
,
fr_lines
[
idx
]))
# print result_cost
sys
.
stderr
.
write
(
results
[
-
1
])
sys
.
stderr
.
flush
()
else
:
sys
.
stderr
.
write
(
"TOOO_LONG
\t
%s
\n
"
%
fr_lines
[
idx
])
sys
.
stderr
.
flush
()
if
xid
:
decode_suffix
=
"beam%dln%dn"
%
(
FLAGS
.
beam_size
,
int
(
100
*
FLAGS
.
length_norm
))
with
tf
.
gfile
.
GFile
(
path
+
".res"
+
decode_suffix
+
xid
,
mode
=
"w"
)
as
f
:
for
line
in
results
:
f
.
write
(
line
)
def mul(l):
  res = 1.0
  for s in l:
    res *= s
  return res
def
interactive
():
"""Interactively probe an existing model."""
with
tf
.
Session
()
as
sess
:
model
,
_
,
_
,
_
,
_
,
_
=
initialize
(
sess
)
sys
.
stdout
.
write
(
"Input to Neural GPU, e.g., 0 1. Use -1 for PAD.
\n
"
)
with
tf
.
Session
(
config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
))
as
sess
:
# Initialize model.
(
model
,
_
,
_
,
_
,
_
,
(
_
,
_
,
en_path
,
fr_path
),
_
,
_
)
=
initialize
(
sess
)
# Load vocabularies.
en_vocab
,
rev_en_vocab
=
wmt
.
initialize_vocabulary
(
en_path
)
_
,
rev_fr_vocab
=
wmt
.
initialize_vocabulary
(
fr_path
)
# Print out vectors and variables.
if
FLAGS
.
nprint
>
0
and
FLAGS
.
word_vector_file_en
:
print_vectors
(
"embedding:0"
,
en_path
,
FLAGS
.
word_vector_file_en
)
if
FLAGS
.
nprint
>
0
and
FLAGS
.
word_vector_file_fr
:
print_vectors
(
"target_embedding:0"
,
fr_path
,
FLAGS
.
word_vector_file_fr
)
total
=
0
for
v
in
tf
.
trainable_variables
():
shape
=
v
.
get_shape
().
as_list
()
total
+=
mul
(
shape
)
print
(
v
.
name
,
shape
,
mul
(
shape
))
print
total
# Start interactive loop.
sys
.
stdout
.
write
(
"Input to Neural GPU Translation Model.
\n
"
)
sys
.
stdout
.
write
(
"> "
)
sys
.
stdout
.
flush
()
inpt
=
sys
.
stdin
.
readline
()
inpt
=
sys
.
stdin
.
readline
()
,
""
while
inpt
:
ids
=
[
data
.
to_id
(
s
)
for
s
in
inpt
.
strip
().
split
()]
inpt
,
target
=
data
.
get_batch
(
len
(
ids
),
1
,
False
,
""
,
preset
=
(
ids
,
[
0
for
_
in
ids
]))
_
,
res
,
_
,
_
=
model
.
step
(
sess
,
inpt
,
target
,
False
)
res
=
[
np
.
argmax
(
o
,
axis
=
1
)
for
o
in
res
]
res
=
[
o
for
o
in
res
[:
len
(
ids
)]
if
o
>
0
]
print
" "
+
" "
.
join
([
data
.
to_symbol
(
output
[
0
])
for
output
in
res
])
cures
=
[]
# Get token-ids for the input sentence.
if
FLAGS
.
simple_tokenizer
:
token_ids
=
wmt
.
sentence_to_token_ids
(
inpt
,
en_vocab
,
tokenizer
=
wmt
.
space_tokenizer
,
normalize_digits
=
FLAGS
.
normalize_digits
)
else
:
token_ids
=
wmt
.
sentence_to_token_ids
(
inpt
,
en_vocab
)
print
[
rev_en_vocab
[
t
]
for
t
in
token_ids
]
# Which bucket does it belong to?
buckets
=
[
b
for
b
in
xrange
(
len
(
data
.
bins
))
if
data
.
bins
[
b
]
>=
max
(
len
(
token_ids
),
len
(
cures
))]
if
cures
:
buckets
=
[
buckets
[
0
]]
if
buckets
:
result
,
result_cost
=
[],
10000000.0
for
bucket_id
in
buckets
:
if
data
.
bins
[
bucket_id
]
>
MAXLEN_F
*
len
(
token_ids
)
+
EVAL_LEN_INCR
:
break
glen
=
1
for
gen_idx
in
xrange
(
glen
):
# Get a 1-element batch to feed the sentence to the model.
inp
,
target
=
data
.
get_batch
(
bucket_id
,
1
,
None
,
FLAGS
.
height
,
preset
=
([
token_ids
],
[
cures
]))
loss
,
output_logits
,
_
,
_
=
model
.
step
(
sess
,
inp
,
target
,
None
,
beam_size
=
FLAGS
.
beam_size
,
update_mem
=
False
)
# If it is a greedy decoder, outputs are argmaxes of output_logits.
if
FLAGS
.
beam_size
>
1
:
outputs
=
[
int
(
o
)
for
o
in
output_logits
]
else
:
loss
=
loss
[
0
]
-
(
data
.
bins
[
bucket_id
]
*
FLAGS
.
length_norm
)
outputs
=
[
int
(
np
.
argmax
(
logit
,
axis
=
1
))
for
logit
in
output_logits
]
print
[
rev_fr_vocab
[
t
]
for
t
in
outputs
]
print
loss
,
data
.
bins
[
bucket_id
]
print
linearize
(
outputs
,
rev_fr_vocab
)
cures
.
append
(
outputs
[
gen_idx
])
print
cures
print
linearize
(
cures
,
rev_fr_vocab
)
if
FLAGS
.
simple_tokenizer
:
cur_out
=
outputs
if
wmt
.
EOS_ID
in
cur_out
:
cur_out
=
cur_out
[:
cur_out
.
index
(
wmt
.
EOS_ID
)]
res_tags
=
[
rev_fr_vocab
[
o
]
for
o
in
cur_out
]
bad_words
,
bad_brack
=
wmt
.
parse_constraints
(
token_ids
,
res_tags
)
loss
+=
1000.0
*
bad_words
+
100.0
*
bad_brack
if
loss
<
result_cost
:
result
=
outputs
result_cost
=
loss
print
(
"FINAL"
,
result_cost
)
print
[
rev_fr_vocab
[
t
]
for
t
in
result
]
print
linearize
(
result
,
rev_fr_vocab
)
else
:
print
"TOOO_LONG"
sys
.
stdout
.
write
(
"> "
)
sys
.
stdout
.
flush
()
inpt
=
sys
.
stdin
.
readline
()
inpt
=
sys
.
stdin
.
readline
()
,
""
def
main
(
_
):
...
...
neural_gpu/program_utils.py
0 → 100644
View file @
a315e568
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for generating program synthesis and evaluation data."""
import contextlib
import sys
import StringIO
import random
import os


class ListType(object):

  def __init__(self, arg):
    self.arg = arg

  def __str__(self):
    return "[" + str(self.arg) + "]"

  def __eq__(self, other):
    if not isinstance(other, ListType):
      return False
    return self.arg == other.arg

  def __hash__(self):
    return hash(self.arg)


class VarType(object):

  def __init__(self, arg):
    self.arg = arg

  def __str__(self):
    return str(self.arg)

  def __eq__(self, other):
    if not isinstance(other, VarType):
      return False
    return self.arg == other.arg

  def __hash__(self):
    return hash(self.arg)


class FunctionType(object):

  def __init__(self, args):
    self.args = args

  def __str__(self):
    return str(self.args[0]) + " -> " + str(self.args[1])

  def __eq__(self, other):
    if not isinstance(other, FunctionType):
      return False
    return self.args == other.args

  def __hash__(self):
    return hash(tuple(self.args))


class Function(object):

  def __init__(self, name, arg_types, output_type, fn_arg_types=None):
    self.name = name
    self.arg_types = arg_types
    self.fn_arg_types = fn_arg_types or []
    self.output_type = output_type


Null = 100


## Functions
f_head = Function("c_head", [ListType("Int")], "Int")
def c_head(xs): return xs[0] if len(xs) > 0 else Null

f_last = Function("c_last", [ListType("Int")], "Int")
def c_last(xs): return xs[-1] if len(xs) > 0 else Null

f_take = Function("c_take", ["Int", ListType("Int")], ListType("Int"))
def c_take(n, xs): return xs[:n]

f_drop = Function("c_drop", ["Int", ListType("Int")], ListType("Int"))
def c_drop(n, xs): return xs[n:]

f_access = Function("c_access", ["Int", ListType("Int")], "Int")
def c_access(n, xs): return xs[n] if n >= 0 and len(xs) > n else Null

f_max = Function("c_max", [ListType("Int")], "Int")
def c_max(xs): return max(xs) if len(xs) > 0 else Null

f_min = Function("c_min", [ListType("Int")], "Int")
def c_min(xs): return min(xs) if len(xs) > 0 else Null

f_reverse = Function("c_reverse", [ListType("Int")], ListType("Int"))
def c_reverse(xs): return list(reversed(xs))

f_sort = Function("sorted", [ListType("Int")], ListType("Int"))
# def c_sort(xs): return sorted(xs)

f_sum = Function("sum", [ListType("Int")], "Int")
# def c_sum(xs): return sum(xs)


## Lambdas
# Int -> Int
def plus_one(x): return x + 1
def minus_one(x): return x - 1
def times_two(x): return x * 2
def neg(x): return x * (-1)
def div_two(x): return int(x / 2)
def sq(x): return x ** 2
def times_three(x): return x * 3
def div_three(x): return int(x / 3)
def times_four(x): return x * 4
def div_four(x): return int(x / 4)

# Int -> Bool
def pos(x): return x > 0
def neg(x): return x < 0
def even(x): return x % 2 == 0
def odd(x): return x % 2 == 1

# Int -> Int -> Int
def add(x, y): return x + y
def sub(x, y): return x - y
def mul(x, y): return x * y


# HOFs
f_map = Function("map", [ListType("Int")], ListType("Int"),
                 [FunctionType(["Int", "Int"])])
f_filter = Function("filter", [ListType("Int")], ListType("Int"),
                    [FunctionType(["Int", "Bool"])])

f_count = Function("c_count", [ListType("Int")], "Int",
                   [FunctionType(["Int", "Bool"])])
def c_count(f, xs): return len([x for x in xs if f(x)])

f_zipwith = Function("c_zipwith", [ListType("Int"), ListType("Int")],
                     ListType("Int"),
                     [FunctionType(["Int", "Int", "Int"])])
#FIX
def c_zipwith(f, xs, ys): return [f(x, y) for (x, y) in zip(xs, ys)]

f_scan = Function("c_scan", [ListType("Int")], ListType("Int"),
                  [FunctionType(["Int", "Int", "Int"])])
def c_scan(f, xs):
  out = xs
  for i in range(1, len(xs)):
    out[i] = f(xs[i], xs[i - 1])
  return out
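
The helpers above are plain Python 2 functions, so they can be exercised directly. A minimal sketch, assuming this file is importable as `program_utils` (note that `neg` is defined twice above, so the later `Int -> Bool` version `x < 0` is the binding that remains in effect):

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

xs = [3, 1, 4, 1, 5]
print(pu.c_head(xs))                 # 3
print(pu.c_count(pu.even, xs))       # 1 (only 4 is even)
print(pu.c_zipwith(pu.add, xs, xs))  # [6, 2, 8, 2, 10]
print(pu.c_scan(pu.add, list(xs)))   # [3, 4, 8, 9, 14] -- prefix sums, since
                                     # c_scan writes into the list it reads
```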
@contextlib.contextmanager
def stdoutIO(stdout=None):
  old = sys.stdout
  if stdout is None:
    stdout = StringIO.StringIO()
  sys.stdout = stdout
  yield stdout
  sys.stdout = old


def evaluate(program_str, input_names_to_vals, default="ERROR"):
  exec_str = []
  for name, val in input_names_to_vals.iteritems():
    exec_str += name + " = " + str(val) + "; "
  exec_str += program_str
  if type(exec_str) is list:
    exec_str = "".join(exec_str)
  with stdoutIO() as s:
    # pylint: disable=bare-except
    try:
      exec exec_str + " print(out)"
      return s.getvalue()[:-1]
    except:
      return default
    # pylint: enable=bare-except
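
`evaluate` runs a program string against named inputs, captures whatever the appended `print(out)` writes, and returns the `default` value if anything raises. A small sketch under the same assumptions (Python 2, module importable as `program_utils`):

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

print(pu.evaluate("out = sum(a);", {"a": [1, 2, 3]}))  # "6"
print(pu.evaluate("out = c_head(a);", {"a": []}))      # "100" (the Null marker)
print(pu.evaluate("out = a[10];", {"a": [1, 2]}))      # "ERROR" (the default)
```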
class Statement(object):
  """Statement class."""

  def __init__(self, fn, output_var, arg_vars, fn_args=None):
    self.fn = fn
    self.output_var = output_var
    self.arg_vars = arg_vars
    self.fn_args = fn_args or []

  def __str__(self):
    return "%s = %s(%s%s%s)" % (self.output_var, self.fn.name,
                                ", ".join(self.fn_args),
                                ", " if self.fn_args else "",
                                ", ".join(self.arg_vars))

  def substitute(self, env):
    self.output_var = env.get(self.output_var, self.output_var)
    self.arg_vars = [env.get(v, v) for v in self.arg_vars]


class ProgramGrower(object):
  """Grow programs."""

  def __init__(self, functions, types_to_lambdas):
    self.functions = functions
    self.types_to_lambdas = types_to_lambdas

  def grow_body(self, new_var_name, dependencies, types_to_vars):
    """Grow the program body."""
    choices = []
    for f in self.functions:
      if all([a in types_to_vars.keys() for a in f.arg_types]):
        choices.append(f)
    f = random.choice(choices)
    args = []
    for t in f.arg_types:
      possible_vars = random.choice(types_to_vars[t])
      var = random.choice(possible_vars)
      args.append(var)
      dependencies.setdefault(new_var_name, []).extend(
          [var] + (dependencies[var]))
    fn_args = [random.choice(self.types_to_lambdas[t])
               for t in f.fn_arg_types]
    types_to_vars.setdefault(f.output_type, []).append(new_var_name)
    return Statement(f, new_var_name, args, fn_args)

  def grow(self, program_len, input_types):
    """Grow the program."""
    var_names = list(reversed(map(chr, range(97, 123))))
    dependencies = dict()
    types_to_vars = dict()
    input_names = []
    for t in input_types:
      var = var_names.pop()
      dependencies[var] = []
      types_to_vars.setdefault(t, []).append(var)
      input_names.append(var)
    statements = []
    for _ in range(program_len - 1):
      var = var_names.pop()
      statements.append(self.grow_body(var, dependencies, types_to_vars))
    statements.append(self.grow_body("out", dependencies, types_to_vars))
    new_var_names = [c for c in map(chr, range(97, 123))
                     if c not in input_names]
    new_var_names.reverse()
    keep_statements = []
    env = dict()
    for s in statements:
      if s.output_var in dependencies["out"]:
        keep_statements.append(s)
        env[s.output_var] = new_var_names.pop()
      if s.output_var == "out":
        keep_statements.append(s)
    for k in keep_statements:
      k.substitute(env)
    return Program(input_names, input_types,
                   ";".join([str(k) for k in keep_statements]))


class Program(object):
  """The program class."""

  def __init__(self, input_names, input_types, body):
    self.input_names = input_names
    self.input_types = input_types
    self.body = body

  def evaluate(self, inputs):
    """Evaluate this program."""
    if len(inputs) != len(self.input_names):
      raise AssertionError("inputs and input_names have to "
                           "have the same len. inp: %s , names: %s"
                           % (str(inputs), str(self.input_names)))
    inp_str = ""
    for (name, inp) in zip(self.input_names, inputs):
      inp_str += name + " = " + str(inp) + "; "
    with stdoutIO() as s:
      # pylint: disable=exec-used
      exec inp_str + self.body + "; print(out)"
      # pylint: enable=exec-used
      return s.getvalue()[:-1]

  def flat_str(self):
    out = ""
    for s in self.body.split(";"):
      out += s + ";"
    return out

  def __str__(self):
    out = ""
    for (n, t) in zip(self.input_names, self.input_types):
      out += n + " = " + str(t) + "\n"
    for s in self.body.split(";"):
      out += s + "\n"
    return out
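
`ProgramGrower.grow` samples a straight-line program over the typed function set, keeps only the statements the final `out` variable depends on, and wraps the result in a `Program`. A hedged sketch; the seed, function subset, and input list below are arbitrary choices made for illustration:

```
# Illustrative only -- not part of program_utils.py.
import random
import program_utils as pu

random.seed(0)
grower = pu.ProgramGrower(
    functions=[pu.f_head, pu.f_last, pu.f_reverse, pu.f_sort, pu.f_sum],
    types_to_lambdas={})  # none of these functions take lambda arguments
prog = grower.grow(3, [pu.ListType("Int")])
print(prog)                        # input declaration plus the kept statements
print(prog.evaluate([[3, 1, 2]]))  # output of the exec'd body, as a string
```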
prog_vocab = []
prog_rev_vocab = {}


def tokenize(string, tokens=None):
  """Tokenize the program string."""
  if tokens is None:
    tokens = prog_vocab
  tokens = sorted(tokens, key=len, reverse=True)
  out = []
  string = string.strip()
  while string:
    found = False
    for t in tokens:
      if string.startswith(t):
        out.append(t)
        string = string[len(t):]
        found = True
        break
    if not found:
      raise ValueError("Couldn't tokenize this: " + string)
    string = string.strip()
  return out


def clean_up(output, max_val=100):
  o = eval(str(output))
  if isinstance(o, bool):
    return o
  if isinstance(o, int):
    if o >= 0:
      return min(o, max_val)
    else:
      return max(o, -1 * max_val)
  if isinstance(o, list):
    return [clean_up(l) for l in o]


def make_vocab():
  gen(2, 0)


def gen(max_len, how_many):
  """Generate some programs."""
  functions = [f_head, f_last, f_take, f_drop, f_access, f_max, f_min,
               f_reverse, f_sort, f_sum, f_map, f_filter, f_count,
               f_zipwith, f_scan]
  types_to_lambdas = {
      FunctionType(["Int", "Int"]): ["plus_one", "minus_one", "times_two",
                                     "div_two", "sq", "times_three",
                                     "div_three", "times_four", "div_four"],
      FunctionType(["Int", "Bool"]): ["pos", "neg", "even", "odd"],
      FunctionType(["Int", "Int", "Int"]): ["add", "sub", "mul"]}
  tokens = []
  for f in functions:
    tokens.append(f.name)
  for v in types_to_lambdas.values():
    tokens.extend(v)
  tokens.extend(["=", ";", ",", "(", ")", "[", "]", "Int", "out"])
  tokens.extend(map(chr, range(97, 123)))
  io_tokens = map(str, range(-220, 220))
  if not prog_vocab:
    prog_vocab.extend(["_PAD", "_EOS"] + tokens + io_tokens)
    for i, t in enumerate(prog_vocab):
      prog_rev_vocab[t] = i
  io_tokens += [",", "[", "]", ")", "(", "None"]

  grower = ProgramGrower(functions=functions,
                         types_to_lambdas=types_to_lambdas)

  def mk_inp(l):
    return [random.choice(range(-5, 5)) for _ in range(l)]

  tar = [ListType("Int")]
  inps = [[mk_inp(3)], [mk_inp(5)], [mk_inp(7)], [mk_inp(15)]]
  save_prefix = None

  outcomes_to_programs = dict()
  tried = set()
  counter = 0
  choices = [0] if max_len == 0 else range(max_len)
  while counter < 100 * how_many and len(outcomes_to_programs) < how_many:
    counter += 1
    length = random.choice(choices)
    t = grower.grow(length, tar)
    while t in tried:
      length = random.choice(choices)
      t = grower.grow(length, tar)
    # print(t.flat_str())
    tried.add(t)
    outcomes = [clean_up(t.evaluate(i)) for i in inps]
    outcome_str = str(zip(inps, outcomes))
    if outcome_str in outcomes_to_programs:
      outcomes_to_programs[outcome_str] = min(
          [t.flat_str(), outcomes_to_programs[outcome_str]],
          key=lambda x: len(tokenize(x, tokens)))
    else:
      outcomes_to_programs[outcome_str] = t.flat_str()
    if counter % 5000 == 0:
      print "== proggen: tried: " + str(counter)
      print "== proggen: kept: " + str(len(outcomes_to_programs))
    if counter % 250000 == 0 and save_prefix is not None:
      print "saving..."
      save_counter = 0
      progfilename = os.path.join(save_prefix, "prog_" + str(counter) + ".txt")
      iofilename = os.path.join(save_prefix, "io_" + str(counter) + ".txt")
      prog_token_filename = os.path.join(
          save_prefix, "prog_tokens_" + str(counter) + ".txt")
      io_token_filename = os.path.join(
          save_prefix, "io_tokens_" + str(counter) + ".txt")
      with open(progfilename, "a+") as fp, \
           open(iofilename, "a+") as fi, \
           open(prog_token_filename, "a+") as ftp, \
           open(io_token_filename, "a+") as fti:
        for (o, p) in outcomes_to_programs.iteritems():
          save_counter += 1
          if save_counter % 500 == 0:
            print "saving %d of %d" % (save_counter, len(outcomes_to_programs))
          fp.write(p + "\n")
          fi.write(o + "\n")
          ftp.write(str(tokenize(p, tokens)) + "\n")
          fti.write(str(tokenize(o, io_tokens)) + "\n")

  return list(outcomes_to_programs.values())
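
`make_vocab` populates `prog_vocab`/`prog_rev_vocab` without generating programs, and `gen(max_len, how_many)` keeps at most one (shortest) program per observed input/output behaviour. A usage sketch with arbitrary small arguments, again assuming the file is importable as `program_utils` under Python 2:

```
# Illustrative only -- not part of program_utils.py.
import program_utils as pu

pu.make_vocab()                      # fills prog_vocab / prog_rev_vocab
print(pu.tokenize("out = sum(a);"))  # ['out', '=', 'sum', '(', 'a', ')', ';']
for flat in pu.gen(3, 5):            # up to 5 behaviourally distinct programs
  print(flat)
```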
neural_gpu/wmt_utils.py
0 → 100644
View file @ a315e568
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for downloading data from WMT, tokenizing, vocabularies."""
import gzip
import os
import re
import tarfile

from six.moves import urllib

import tensorflow as tf

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_CHAR_UNK"
_SPACE = b"_SPACE"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK, _SPACE]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
SPACE_ID = 4

# Regular expressions used to tokenize.
_CHAR_MARKER = "_CHAR_"
_CHAR_MARKER_LEN = len(_CHAR_MARKER)
_SPEC_CHARS = "" + chr(226) + chr(153) + chr(128)
_PUNCTUATION = "][.,!?\"':;%$#@&*+}{|><=/^~)(_`,0123456789" + _SPEC_CHARS + "-"
_WORD_SPLIT = re.compile(b"([" + _PUNCTUATION + "])")
_OLD_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")

# URLs for WMT data.
_WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
_WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"


def maybe_download(directory, filename, url):
  """Download filename from url unless it's already in directory."""
  if not tf.gfile.Exists(directory):
    print "Creating directory %s" % directory
    os.mkdir(directory)
  filepath = os.path.join(directory, filename)
  if not tf.gfile.Exists(filepath):
    print "Downloading %s to %s" % (url, filepath)
    filepath, _ = urllib.request.urlretrieve(url, filepath)
    statinfo = os.stat(filepath)
    print "Successfully downloaded", filename, statinfo.st_size, "bytes"
  return filepath


def gunzip_file(gz_path, new_path):
  """Unzips from gz_path into new_path."""
  print "Unpacking %s to %s" % (gz_path, new_path)
  with gzip.open(gz_path, "rb") as gz_file:
    with open(new_path, "wb") as new_file:
      for line in gz_file:
        new_file.write(line)


def get_wmt_enfr_train_set(directory):
  """Download the WMT en-fr training corpus to directory unless it's there."""
  train_path = os.path.join(directory, "giga-fren.release2.fixed")
  if not (tf.gfile.Exists(train_path + ".fr") and
          tf.gfile.Exists(train_path + ".en")):
    corpus_file = maybe_download(directory, "training-giga-fren.tar",
                                 _WMT_ENFR_TRAIN_URL)
    print "Extracting tar file %s" % corpus_file
    with tarfile.open(corpus_file, "r") as corpus_tar:
      corpus_tar.extractall(directory)
    gunzip_file(train_path + ".fr.gz", train_path + ".fr")
    gunzip_file(train_path + ".en.gz", train_path + ".en")
  return train_path


def get_wmt_enfr_dev_set(directory):
  """Download the WMT en-fr dev set to directory unless it's there."""
  dev_name = "newstest2013"
  dev_path = os.path.join(directory, dev_name)
  if not (tf.gfile.Exists(dev_path + ".fr") and
          tf.gfile.Exists(dev_path + ".en")):
    dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
    print "Extracting tgz file %s" % dev_file
    with tarfile.open(dev_file, "r:gz") as dev_tar:
      fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
      en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
      fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
      en_dev_file.name = dev_name + ".en"
      dev_tar.extract(fr_dev_file, directory)
      dev_tar.extract(en_dev_file, directory)
  return dev_path


def is_char(token):
  if len(token) > _CHAR_MARKER_LEN:
    if token[:_CHAR_MARKER_LEN] == _CHAR_MARKER:
      return True
  return False


def basic_detokenizer(tokens):
  """Reverse the process of the basic tokenizer below."""
  result = []
  previous_nospace = True
  for t in tokens:
    if is_char(t):
      result.append(t[_CHAR_MARKER_LEN:])
      previous_nospace = True
    elif t == _SPACE:
      result.append(" ")
      previous_nospace = True
    elif previous_nospace:
      result.append(t)
      previous_nospace = False
    else:
      result.extend([" ", t])
      previous_nospace = False
  return "".join(result)


old_style = False


def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens."""
  words = []
  if old_style:
    for space_separated_fragment in sentence.strip().split():
      words.extend(re.split(_OLD_WORD_SPLIT, space_separated_fragment))
    return [w for w in words if w]
  for space_separated_fragment in sentence.strip().split():
    tokens = [t for t in re.split(_WORD_SPLIT, space_separated_fragment) if t]
    first_is_char = False
    for i, t in enumerate(tokens):
      if len(t) == 1 and t in _PUNCTUATION:
        tokens[i] = _CHAR_MARKER + t
        if i == 0:
          first_is_char = True
    if words and words[-1] != _SPACE and (first_is_char or is_char(words[-1])):
      tokens = [_SPACE] + tokens
    spaced_tokens = []
    for i, tok in enumerate(tokens):
      spaced_tokens.append(tokens[i])
      if i < len(tokens) - 1:
        if tok != _SPACE and not (is_char(tok) or is_char(tokens[i + 1])):
          spaced_tokens.append(_SPACE)
    words.extend(spaced_tokens)
  return words
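
`basic_tokenizer` marks punctuation with `_CHAR_` and inserts explicit `_SPACE` tokens so that `basic_detokenizer` can reconstruct the original spacing. A round-trip sketch, assuming the file is importable as `wmt_utils` (it imports TensorFlow at module load and uses Python 2 print statements):

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

toks = wmt_utils.basic_tokenizer("Hello, world!")
print(toks)  # ['Hello', '_CHAR_,', '_SPACE', 'world', '_CHAR_!']
print(wmt_utils.basic_detokenizer(toks))  # Hello, world!
```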
def space_tokenizer(sentence):
  return sentence.strip().split()


def is_pos_tag(token):
  """Check if token is a part-of-speech tag."""
  return (token in ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS",
                    "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
                    "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO",
                    "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
                    "WP", "WP$", "WRB", ".", ",", ":", ")", "-LRB-", "(",
                    "-RRB-", "HYPH", "$", "``", "''", "ADD", "AFX", "QTR",
                    "BES", "-DFL-", "GW", "HVS", "NFP"])


def parse_constraints(inpt, res):
  ntags = len(res)
  nwords = len(inpt)
  npostags = len([x for x in res if is_pos_tag(x)])
  nclose = len([x for x in res if x[0] == "/"])
  nopen = ntags - nclose - npostags
  return (abs(npostags - nwords), abs(nclose - nopen))
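
`parse_constraints` scores a candidate tag sequence against the source sentence: the first number is how far the count of part-of-speech tags is from the number of input words, the second how unbalanced the opening and `/`-prefixed closing brackets are. A small illustrative check:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

# Two POS tags (DT, NN) for two input words, two opening and two closing
# brackets, so both penalties are zero.
print(wmt_utils.parse_constraints(
    ["the", "dog"], ["S", "NP", "DT", "NN", "/NP", "/S"]))  # (0, 0)
```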
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=False):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that the
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not tf.gfile.Exists(vocabulary_path):
    print "Creating vocabulary %s from data %s" % (vocabulary_path, data_path)
    vocab, chars = {}, {}
    for c in _PUNCTUATION:
      chars[c] = 1
    # Read French file.
    with tf.gfile.GFile(data_path + ".fr", mode="rb") as f:
      counter = 0
      for line_in in f:
        line = " ".join(line_in.split())
        counter += 1
        if counter % 100000 == 0:
          print " processing fr line %d" % counter
        for c in line:
          if c in chars:
            chars[c] += 1
          else:
            chars[c] = 1
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        tokens = [t for t in tokens if not is_char(t) and t != _SPACE]
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1000000000  # We want target words first.
          else:
            vocab[word] = 1000000000
    # Read English file.
    with tf.gfile.GFile(data_path + ".en", mode="rb") as f:
      counter = 0
      for line_in in f:
        line = " ".join(line_in.split())
        counter += 1
        if counter % 100000 == 0:
          print " processing en line %d" % counter
        for c in line:
          if c in chars:
            chars[c] += 1
          else:
            chars[c] = 1
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        tokens = [t for t in tokens if not is_char(t) and t != _SPACE]
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    sorted_chars = sorted(chars, key=vocab.get, reverse=True)
    sorted_chars = [_CHAR_MARKER + c for c in sorted_chars]
    vocab_list = _START_VOCAB + sorted_chars + sorted_vocab
    if tokenizer:
      vocab_list = _START_VOCAB + sorted_vocab
    if len(vocab_list) > max_vocabulary_size:
      vocab_list = vocab_list[:max_vocabulary_size]
    with tf.gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + b"\n")


def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if tf.gfile.Exists(vocabulary_path):
    rev_vocab = []
    with tf.gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)


def sentence_to_token_ids_raw(sentence, vocabulary,
                              tokenizer=None, normalize_digits=old_style):
  """Convert a string to list of integers representing token-ids.

  For example, a sentence "I have a dog" may become tokenized into
  ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
  "a": 4, "dog": 7} this function will return [1, 2, 4, 7].

  Args:
    sentence: the sentence in bytes format to convert to token-ids.
    vocabulary: a dictionary mapping tokens to integers.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

  Returns:
    a list of integers, the token-ids for the sentence.
  """
  if tokenizer:
    words = tokenizer(sentence)
  else:
    words = basic_tokenizer(sentence)
  result = []
  for w in words:
    if normalize_digits:
      w = re.sub(_DIGIT_RE, b"0", w)
    if w in vocabulary:
      result.append(vocabulary[w])
    else:
      if tokenizer:
        result.append(UNK_ID)
      else:
        result.append(SPACE_ID)
        for c in w:
          result.append(vocabulary.get(_CHAR_MARKER + c, UNK_ID))
        result.append(SPACE_ID)
  while result and result[0] == SPACE_ID:
    result = result[1:]
  while result and result[-1] == SPACE_ID:
    result = result[:-1]
  return result


def sentence_to_token_ids(sentence, vocabulary,
                          tokenizer=None, normalize_digits=old_style):
  """Convert a string to list of integers representing token-ids, tab=0."""
  tab_parts = sentence.strip().split("\t")
  toks = [sentence_to_token_ids_raw(t, vocabulary, tokenizer, normalize_digits)
          for t in tab_parts]
  res = []
  for t in toks:
    res.extend(t)
    res.append(0)
  return res[:-1]
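
With a word-level tokenizer and a toy vocabulary this reduces to a simple lookup, matching the docstring example above. A sketch under the same import assumptions:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

vocab = {"I": 5, "have": 6, "a": 7, "dog": 8}
print(wmt_utils.sentence_to_token_ids(
    "I have a dog", vocab, tokenizer=wmt_utils.space_tokenizer))  # [5, 6, 7, 8]
```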
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=False):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not tf.gfile.Exists(target_path):
    print "Tokenizing data in %s" % data_path
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with tf.gfile.GFile(data_path, mode="rb") as data_file:
      with tf.gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print " tokenizing line %d" % counter
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


def prepare_wmt_data(data_dir, vocabulary_size,
                     tokenizer=None, normalize_digits=False):
  """Get WMT data into data_dir, create vocabularies and tokenize data.

  Args:
    data_dir: directory in which the data sets will be stored.
    vocabulary_size: size of the joint vocabulary to create and use.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

  Returns:
    A tuple of 6 elements:
      (1) path to the token-ids for English training data-set,
      (2) path to the token-ids for French training data-set,
      (3) path to the token-ids for English development data-set,
      (4) path to the token-ids for French development data-set,
      (5) path to the vocabulary file,
      (6) path to the vocabulary file (for compatibility with non-joint vocab).
  """
  # Get wmt data to the specified directory.
  train_path = get_wmt_enfr_train_set(data_dir)
  dev_path = get_wmt_enfr_dev_set(data_dir)

  # Create vocabularies of the appropriate sizes.
  vocab_path = os.path.join(data_dir, "vocab%d.txt" % vocabulary_size)
  create_vocabulary(vocab_path, train_path, vocabulary_size,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  # Create token ids for the training data.
  fr_train_ids_path = train_path + (".ids%d.fr" % vocabulary_size)
  en_train_ids_path = train_path + (".ids%d.en" % vocabulary_size)
  data_to_token_ids(train_path + ".fr", fr_train_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)
  data_to_token_ids(train_path + ".en", en_train_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  # Create token ids for the development data.
  fr_dev_ids_path = dev_path + (".ids%d.fr" % vocabulary_size)
  en_dev_ids_path = dev_path + (".ids%d.en" % vocabulary_size)
  data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)
  data_to_token_ids(dev_path + ".en", en_dev_ids_path, vocab_path,
                    tokenizer=tokenizer, normalize_digits=normalize_digits)

  return (en_train_ids_path, fr_train_ids_path,
          en_dev_ids_path, fr_dev_ids_path,
          vocab_path, vocab_path)
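
End to end, the trainer only needs the returned paths. A hedged sketch of a call; the directory and vocabulary size below are illustrative, and the first run downloads several gigabytes of WMT data from statmt.org:

```
# Illustrative only -- not part of wmt_utils.py.
import wmt_utils

(en_train, fr_train, en_dev, fr_dev,
 vocab_path, _) = wmt_utils.prepare_wmt_data("/tmp/wmt_data", 32000)
print(vocab_path)  # e.g. /tmp/wmt_data/vocab32000.txt
```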