ModelZoo / ResNet50_tensorflow

Commit c902a867 authored Jan 03, 2017 by Viacheslav Kovalevskyi

Ability to train the translation model on arbitrary input sources.

parent 0d9a3abd
Showing 3 changed files with 96 additions and 44 deletions
tutorials/rnn/translate/data_utils.py     +50 -18
tutorials/rnn/translate/seq2seq_model.py  +8 -13
tutorials/rnn/translate/translate.py      +38 -13
tutorials/rnn/translate/data_utils.py
...
@@ -239,8 +239,8 @@ def data_to_token_ids(data_path, target_path, vocabulary_path,
           counter += 1
           if counter % 100000 == 0:
             print("  tokenizing line %d" % counter)
-          token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
+          token_ids = sentence_to_token_ids(line, vocab,
                                             tokenizer,
                                             normalize_digits)
           tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
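For reference, tf.compat.as_bytes (seen in the removed line above) normalizes a text line to UTF-8 bytes and passes byte strings through unchanged, which keeps token lookups consistent when the vocabulary is keyed on byte strings. A minimal sketch of that behavior, not part of this commit:

# Sketch only: illustrates tf.compat.as_bytes, which the removed line relied on.
import tensorflow as tf

print(tf.compat.as_bytes(u"Qui êtes-vous ?"))   # b'Qui \xc3\xaates-vous ?'
print(tf.compat.as_bytes(b"already bytes"))     # bytes are passed through as-is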
...
@@ -267,24 +267,56 @@ def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer
   train_path = get_wmt_enfr_train_set(data_dir)
   dev_path = get_wmt_enfr_dev_set(data_dir)

+  from_train_path = train_path + ".en"
+  to_train_path = train_path + ".fr"
+  from_dev_path = dev_path + ".en"
+  to_dev_path = dev_path + ".fr"
+  return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path,
+                      to_dev_path, en_vocabulary_size, fr_vocabulary_size,
+                      tokenizer)
+
+
+def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path,
+                 to_dev_path, from_vocabulary_size, to_vocabulary_size,
+                 tokenizer=None):
+  """Prepare all necessary files that are required for the training.
+
+    Args:
+      data_dir: directory in which the data sets will be stored.
+      from_train_path: path to the file that includes "from" training samples.
+      to_train_path: path to the file that includes "to" training samples.
+      from_dev_path: path to the file that includes "from" dev samples.
+      to_dev_path: path to the file that includes "to" dev samples.
+      from_vocabulary_size: size of the "from language" vocabulary to create and use.
+      to_vocabulary_size: size of the "to language" vocabulary to create and use.
+      tokenizer: a function to use to tokenize each data sentence;
+        if None, basic_tokenizer will be used.
+
+    Returns:
+      A tuple of 6 elements:
+        (1) path to the token-ids for "from language" training data-set,
+        (2) path to the token-ids for "to language" training data-set,
+        (3) path to the token-ids for "from language" development data-set,
+        (4) path to the token-ids for "to language" development data-set,
+        (5) path to the "from language" vocabulary file,
+        (6) path to the "to language" vocabulary file.
+    """
   # Create vocabularies of the appropriate sizes.
-  fr_vocab_path = os.path.join(data_dir, "vocab%d.fr" % fr_vocabulary_size)
-  en_vocab_path = os.path.join(data_dir, "vocab%d.en" % en_vocabulary_size)
-  create_vocabulary(fr_vocab_path, train_path + ".fr", fr_vocabulary_size, tokenizer)
-  create_vocabulary(en_vocab_path, train_path + ".en", en_vocabulary_size, tokenizer)
+  to_vocab_path = os.path.join(data_dir, "vocab%d" % to_vocabulary_size)
+  from_vocab_path = os.path.join(data_dir, "vocab%d" % from_vocabulary_size)
+  create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
+  create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)

   # Create token ids for the training data.
-  fr_train_ids_path = train_path + (".ids%d.fr" % fr_vocabulary_size)
-  en_train_ids_path = train_path + (".ids%d.en" % en_vocabulary_size)
-  data_to_token_ids(train_path + ".fr", fr_train_ids_path, fr_vocab_path, tokenizer)
-  data_to_token_ids(train_path + ".en", en_train_ids_path, en_vocab_path, tokenizer)
+  to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
+  from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
+  data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
+  data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)

   # Create token ids for the development data.
-  fr_dev_ids_path = dev_path + (".ids%d.fr" % fr_vocabulary_size)
-  en_dev_ids_path = dev_path + (".ids%d.en" % en_vocabulary_size)
-  data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, fr_vocab_path, tokenizer)
-  data_to_token_ids(dev_path + ".en", en_dev_ids_path, en_vocab_path, tokenizer)
+  to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
+  from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
+  data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
+  data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

-  return (en_train_ids_path, fr_train_ids_path,
-          en_dev_ids_path, fr_dev_ids_path,
-          en_vocab_path, fr_vocab_path)
+  return (from_train_ids_path, to_train_ids_path,
+          from_dev_ids_path, to_dev_ids_path,
+          from_vocab_path, to_vocab_path)
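The new prepare_data helper is the piece that enables arbitrary corpora: given four plain-text files (one sentence per line), it builds the vocabularies and token-id files and returns their paths. A minimal usage sketch, with placeholder paths that are not part of the commit and the default basic_tokenizer:

# Hypothetical call to the new helper; run from tutorials/rnn/translate/ so that
# data_utils is importable. All file paths below are placeholders.
import data_utils

(from_train_ids, to_train_ids,
 from_dev_ids, to_dev_ids,
 from_vocab, to_vocab) = data_utils.prepare_data(
    data_dir="/tmp/my_corpus",                   # vocabularies are written here
    from_train_path="/tmp/my_corpus/train.src",  # source-language training text
    to_train_path="/tmp/my_corpus/train.tgt",    # target-language training text
    from_dev_path="/tmp/my_corpus/dev.src",
    to_dev_path="/tmp/my_corpus/dev.tgt",
    from_vocabulary_size=40000,
    to_vocabulary_size=40000)                    # tokenizer=None -> basic_tokenizer
print(from_train_ids, to_vocab)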
tutorials/rnn/translate/seq2seq_model.py
...
@@ -108,27 +108,22 @@ class Seq2SeqModel(object):
         local_b = tf.cast(b, tf.float32)
         local_inputs = tf.cast(inputs, tf.float32)
         return tf.cast(
             tf.nn.sampled_softmax_loss(
-                weights=local_w_t,
-                biases=local_b,
-                labels=labels,
-                inputs=local_inputs,
-                num_sampled=num_samples,
-                num_classes=self.target_vocab_size),
+                local_w_t, local_b, local_inputs, labels,
+                num_samples, self.target_vocab_size),
             dtype)
       softmax_loss_function = sampled_loss

     # Create the internal multi-layer cell for our RNN.
-    single_cell = tf.contrib.rnn.GRUCell(size)
+    single_cell = tf.nn.rnn_cell.GRUCell(size)
     if use_lstm:
-      single_cell = tf.contrib.rnn.BasicLSTMCell(size)
+      single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
     cell = single_cell
     if num_layers > 1:
-      cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers)
+      cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

     # The seq2seq function: we use embedding for the input and attention.
     def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
-      return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
+      return tf.nn.seq2seq.embedding_attention_seq2seq(
           encoder_inputs,
           decoder_inputs,
           cell,
...
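The two call styles above differ only in how the arguments of tf.nn.sampled_softmax_loss are bound. Because the positional order of labels and inputs changed around the TensorFlow 1.0 release, the keyword form pins each tensor to its parameter name and cannot be silently swapped. A standalone sketch of that keyword call, assuming TensorFlow 1.x and made-up shapes, for reference only:

# Sketch only (assumes TensorFlow 1.x): keyword-bound sampled softmax loss with
# toy shapes; not part of this commit.
import tensorflow as tf

vocab_size, hidden_size, num_samples, batch = 1000, 64, 32, 8

w_t = tf.get_variable("proj_w", [vocab_size, hidden_size], dtype=tf.float32)
b = tf.get_variable("proj_b", [vocab_size], dtype=tf.float32)

inputs = tf.random_normal([batch, hidden_size])              # decoder outputs
labels = tf.reshape(
    tf.random_uniform([batch], maxval=vocab_size, dtype=tf.int64), [-1, 1])

loss = tf.nn.sampled_softmax_loss(
    weights=w_t,              # [num_classes, dim]
    biases=b,                 # [num_classes]
    labels=labels,            # [batch, 1] target ids
    inputs=inputs,            # [batch, dim]
    num_sampled=num_samples,
    num_classes=vocab_size)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(loss))       # one loss value per example in the batch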
@@ -158,7 +153,7 @@ class Seq2SeqModel(object):
     # Training outputs and losses.
     if forward_only:
-      self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+      self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
           softmax_loss_function=softmax_loss_function)
...
@@ -170,7 +165,7 @@ class Seq2SeqModel(object):
           for output in self.outputs[b]
       ]
     else:
-      self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+      self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, buckets,
           lambda x, y: seq2seq_f(x, y, False),
...
tutorials/rnn/translate/translate.py
...
@@ -55,10 +55,14 @@ tf.app.flags.DEFINE_integer("batch_size", 64,
                             "Batch size to use during training.")
 tf.app.flags.DEFINE_integer("size", 1024, "Size of each model layer.")
 tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.")
-tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.")
-tf.app.flags.DEFINE_integer("fr_vocab_size", 40000, "French vocabulary size.")
+tf.app.flags.DEFINE_integer("from_vocab_size", 40000, "English vocabulary size.")
+tf.app.flags.DEFINE_integer("to_vocab_size", 40000, "French vocabulary size.")
 tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
 tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.")
+tf.app.flags.DEFINE_string("from_train_data", None, "Training data.")
+tf.app.flags.DEFINE_string("to_train_data", None, "Training data.")
+tf.app.flags.DEFINE_string("from_dev_data", None, "Training data.")
+tf.app.flags.DEFINE_string("to_dev_data", None, "Training data.")
 tf.app.flags.DEFINE_integer("max_train_data_size", 0,
                             "Limit on the size of training data (0: no limit).")
 tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
...
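Taken together with the train() changes below, these flags mean a run on custom parallel data would presumably be launched with something like `python translate.py --data_dir=/tmp/data --train_dir=/tmp/model --from_train_data=train.src --to_train_data=train.tgt --from_dev_data=dev.src --to_dev_data=dev.tgt` (all paths here are placeholders); when the *_train_data flags are left unset, the script falls back to the WMT English-French pipeline as before.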
@@ -119,8 +123,8 @@ def create_model(session, forward_only):
   """Create translation model and initialize or load parameters in session."""
   dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
   model = seq2seq_model.Seq2SeqModel(
-      FLAGS.en_vocab_size,
-      FLAGS.fr_vocab_size,
+      FLAGS.from_vocab_size,
+      FLAGS.to_vocab_size,
       _buckets,
       FLAGS.size,
       FLAGS.num_layers,
...
@@ -142,10 +146,31 @@ def create_model(session, forward_only):
 def train():
   """Train a en->fr translation model using WMT data."""
-  # Prepare WMT data.
-  print("Preparing WMT data in %s" % FLAGS.data_dir)
-  en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
-      FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
+  from_train = None
+  to_train = None
+  from_dev = None
+  to_dev = None
+  if FLAGS.from_train_data and FLAGS.to_train_data:
+    from_train_data = FLAGS.from_train_data
+    to_train_data = FLAGS.to_train_data
+    from_dev_data = from_train_data
+    to_dev_data = to_train_data
+    if FLAGS.from_dev_data and FLAGS.to_dev_data:
+      from_dev_data = FLAGS.from_dev_data
+      to_dev_data = FLAGS.to_dev_data
+    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
+        FLAGS.data_dir,
+        from_train_data,
+        to_train_data,
+        from_dev_data,
+        to_dev_data,
+        FLAGS.from_vocab_size,
+        FLAGS.to_vocab_size)
+  else:
+    # Prepare WMT data.
+    print("Preparing WMT data in %s" % FLAGS.data_dir)
+    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_wmt_data(
+        FLAGS.data_dir, FLAGS.from_vocab_size, FLAGS.to_vocab_size)

   with tf.Session() as sess:
     # Create model.
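Note the defaulting in the new branch: when only --from_train_data and --to_train_data are supplied, the same files are reused as the development set, and explicit --from_dev_data/--to_dev_data override that.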
...
@@ -155,8 +180,8 @@ def train():
     # Read data into buckets and compute their sizes.
     print ("Reading development and training data (limit: %d)."
            % FLAGS.max_train_data_size)
-    dev_set = read_data(en_dev, fr_dev)
-    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
+    dev_set = read_data(from_dev, to_dev)
+    train_set = read_data(from_train, to_train, FLAGS.max_train_data_size)
     train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
     train_total_size = float(sum(train_bucket_sizes))
...
@@ -225,9 +250,9 @@ def decode():
     # Load vocabularies.
     en_vocab_path = os.path.join(FLAGS.data_dir,
-                                 "vocab%d.en" % FLAGS.en_vocab_size)
+                                 "vocab%d.from" % FLAGS.from_vocab_size)
     fr_vocab_path = os.path.join(FLAGS.data_dir,
-                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
+                                 "vocab%d.to" % FLAGS.to_vocab_size)
     en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
     _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
...
@@ -245,7 +270,7 @@ def decode():
         bucket_id = i
         break
     else:
       logging.warning("Sentence truncated: %s", sentence)

     # Get a 1-element batch to feed the sentence to the model.
     encoder_inputs, decoder_inputs, target_weights = model.get_batch(
...