Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
85cfe94d
Commit
85cfe94d
authored
Jun 16, 2020
by
Tianqi Liu
Committed by
A. Unique TensorFlower
Jun 16, 2020
Browse files
Internal cleanup.
PiperOrigin-RevId: 316734574
parent
c64cb01b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
44 additions
and
35 deletions
+44
-35
official/nlp/bert/model_training_utils.py
official/nlp/bert/model_training_utils.py
+38
-31
official/nlp/bert/model_training_utils_test.py
official/nlp/bert/model_training_utils_test.py
+6
-4
No files found.
official/nlp/bert/model_training_utils.py
View file @
85cfe94d
...
...
@@ -111,6 +111,7 @@ def run_customized_training_loop(
model_dir
=
None
,
train_input_fn
=
None
,
steps_per_epoch
=
None
,
num_eval_per_epoch
=
1
,
steps_per_loop
=
None
,
epochs
=
1
,
eval_input_fn
=
None
,
...
...
@@ -144,6 +145,7 @@ def run_customized_training_loop(
steps_per_epoch: Number of steps to run per epoch. At the end of each
epoch, model checkpoint will be saved and evaluation will be conducted
if evaluation dataset is provided.
num_eval_per_epoch: Number of evaluations per epoch.
steps_per_loop: Number of steps per graph-mode loop. In order to reduce
communication in eager context, training logs are printed every
steps_per_loop.
...
...
@@ -166,8 +168,8 @@ def run_customized_training_loop(
sub_model_export_name: If not None, will export `sub_model` returned by
`model_fn` into checkpoint files. The name of intermediate checkpoint
file is {sub_model_export_name}_step_{step}.ckpt and the last
checkpoint's name is {sub_model_export_name}.ckpt;
if None, `sub_model`
will not be exported as checkpoint.
checkpoint's name is {sub_model_export_name}.ckpt;
if None, `sub_model`
will not be exported as checkpoint.
explicit_allreduce: Whether to explicitly perform gradient allreduce,
instead of relying on implicit allreduce in optimizer.apply_gradients().
default is False. For now, if training using FP16 mixed precision,
...
...
@@ -177,10 +179,10 @@ def run_customized_training_loop(
pre_allreduce_callbacks: A list of callback functions that takes gradients
and model variables pairs as input, manipulate them, and returns a new
gradients and model variables pairs. The callback functions will be
invoked in the list order and before gradients are allreduced.
With
mixed precision training, the pre_allreduce_callbacks will be
applied on
scaled_gradients. Default is no callbacks.
Only used when
explicit_allreduce=True.
invoked in the list order and before gradients are allreduced.
With
mixed precision training, the pre_allreduce_callbacks will be
applied on
scaled_gradients. Default is no callbacks.
Only used when
explicit_allreduce=True.
post_allreduce_callbacks: A list of callback functions that takes
gradients and model variables pairs as input, manipulate them, and
returns a new gradients and model variables pairs. The callback
...
...
@@ -208,6 +210,8 @@ def run_customized_training_loop(
required_arguments
=
[
strategy
,
model_fn
,
loss_fn
,
model_dir
,
steps_per_epoch
,
train_input_fn
]
steps_between_evals
=
int
(
steps_per_epoch
/
num_eval_per_epoch
)
if
[
arg
for
arg
in
required_arguments
if
arg
is
None
]:
raise
ValueError
(
'`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
'`steps_per_epoch` and `train_input_fn` are required '
...
...
@@ -216,17 +220,17 @@ def run_customized_training_loop(
if
tf
.
config
.
list_logical_devices
(
'TPU'
):
# One can't fully utilize a TPU with steps_per_loop=1, so in this case
# default users to a more useful value.
steps_per_loop
=
min
(
1000
,
steps_
per_epoch
)
steps_per_loop
=
min
(
1000
,
steps_
between_evals
)
else
:
steps_per_loop
=
1
logging
.
info
(
'steps_per_loop not specified. Using steps_per_loop=%d'
,
steps_per_loop
)
if
steps_per_loop
>
steps_
per_epoch
:
if
steps_per_loop
>
steps_
between_evals
:
logging
.
warning
(
'steps_per_loop: %d is specified to be greater than '
' steps_
per_epoch
: %d, we will use steps_
per_epoch
as'
' steps_per_loop.'
,
steps_per_loop
,
steps_
per_epoch
)
steps_per_loop
=
steps_
per_epoch
' steps_
between_evals
: %d, we will use steps_
between_evals
as'
' steps_per_loop.'
,
steps_per_loop
,
steps_
between_evals
)
steps_per_loop
=
steps_
between_evals
assert
tf
.
executing_eagerly
()
if
run_eagerly
:
...
...
@@ -246,8 +250,7 @@ def run_customized_training_loop(
total_training_steps
=
steps_per_epoch
*
epochs
train_iterator
=
_get_input_iterator
(
train_input_fn
,
strategy
)
eval_loss_metric
=
tf
.
keras
.
metrics
.
Mean
(
'training_loss'
,
dtype
=
tf
.
float32
)
eval_loss_metric
=
tf
.
keras
.
metrics
.
Mean
(
'training_loss'
,
dtype
=
tf
.
float32
)
with
distribution_utils
.
get_strategy_scope
(
strategy
):
# To correctly place the model weights on accelerators,
...
...
@@ -270,8 +273,7 @@ def run_customized_training_loop(
checkpoint
.
restore
(
init_checkpoint
).
assert_existing_objects_matched
()
logging
.
info
(
'Loading from checkpoint file completed'
)
train_loss_metric
=
tf
.
keras
.
metrics
.
Mean
(
'training_loss'
,
dtype
=
tf
.
float32
)
train_loss_metric
=
tf
.
keras
.
metrics
.
Mean
(
'training_loss'
,
dtype
=
tf
.
float32
)
eval_metrics
=
[
metric_fn
()]
if
metric_fn
else
[]
# If evaluation is required, make a copy of metric as it will be used by
# both train and evaluation.
...
...
@@ -440,18 +442,19 @@ def run_customized_training_loop(
latest_checkpoint_file
=
tf
.
train
.
latest_checkpoint
(
model_dir
)
if
latest_checkpoint_file
:
logging
.
info
(
'Checkpoint file %s found and restoring from '
'checkpoint'
,
latest_checkpoint_file
)
logging
.
info
(
'Checkpoint file %s found and restoring from '
'checkpoint'
,
latest_checkpoint_file
)
checkpoint
.
restore
(
latest_checkpoint_file
)
logging
.
info
(
'Loading from checkpoint file completed'
)
current_step
=
optimizer
.
iterations
.
numpy
()
checkpoint_name
=
'ctl_step_{step}.ckpt'
logs
=
{}
while
current_step
<
total_training_steps
:
if
current_step
%
steps_per_epoch
==
0
:
callback_list
.
on_epoch_begin
(
int
(
current_step
/
steps_per_epoch
)
+
1
)
callback_list
.
on_epoch_begin
(
int
(
current_step
/
steps_per_epoch
)
+
1
)
# Training loss/metrics are averaged over the steps inside the micro
# training loop. We reset their values before each round.
...
...
@@ -461,7 +464,7 @@ def run_customized_training_loop(
callback_list
.
on_batch_begin
(
current_step
)
# Runs several steps in the host while loop.
steps
=
steps_to_run
(
current_step
,
steps_
per_epoch
,
steps_per_loop
)
steps
=
steps_to_run
(
current_step
,
steps_
between_evals
,
steps_per_loop
)
if
tf
.
config
.
list_physical_devices
(
'GPU'
):
# TODO(zongweiz): merge with train_steps once tf.while_loop
...
...
@@ -470,11 +473,9 @@ def run_customized_training_loop(
train_single_step
(
train_iterator
)
else
:
# Converts steps to a Tensor to avoid tf.function retracing.
train_steps
(
train_iterator
,
tf
.
convert_to_tensor
(
steps
,
dtype
=
tf
.
int32
))
train_steps
(
train_iterator
,
tf
.
convert_to_tensor
(
steps
,
dtype
=
tf
.
int32
))
train_loss
=
_float_metric_value
(
train_loss_metric
)
current_step
+=
steps
callback_list
.
on_batch_end
(
current_step
-
1
,
{
'loss'
:
train_loss
})
# Updates training logging.
training_status
=
'Train Step: %d/%d / loss = %s'
%
(
...
...
@@ -492,8 +493,7 @@ def run_customized_training_loop(
'learning_rate'
,
optimizer
.
learning_rate
(
current_step
),
step
=
current_step
)
tf
.
summary
.
scalar
(
train_loss_metric
.
name
,
train_loss
,
step
=
current_step
)
tf
.
summary
.
scalar
(
train_loss_metric
.
name
,
train_loss
,
step
=
current_step
)
for
metric
in
train_metrics
+
model
.
metrics
:
metric_value
=
_float_metric_value
(
metric
)
training_status
+=
' %s = %f'
%
(
metric
.
name
,
metric_value
)
...
...
@@ -501,7 +501,11 @@ def run_customized_training_loop(
summary_writer
.
flush
()
logging
.
info
(
training_status
)
if
current_step
%
steps_per_epoch
==
0
:
# If no need for evaluation, we only call on_batch_end with train_loss,
# this is to ensure we get granular global_step/sec on Tensorboard.
if
current_step
%
steps_between_evals
:
callback_list
.
on_batch_end
(
current_step
-
1
,
{
'loss'
:
train_loss
})
else
:
# Save a submodel with the step in the file name after each epoch.
if
sub_model_export_name
:
_save_checkpoint
(
...
...
@@ -514,7 +518,6 @@ def run_customized_training_loop(
if
current_step
<
total_training_steps
:
_save_checkpoint
(
strategy
,
checkpoint
,
model_dir
,
checkpoint_name
.
format
(
step
=
current_step
))
logs
=
None
if
eval_input_fn
:
logging
.
info
(
'Running evaluation after step: %s.'
,
current_step
)
logs
=
_run_evaluation
(
current_step
,
...
...
@@ -523,8 +526,15 @@ def run_customized_training_loop(
eval_loss_metric
.
reset_states
()
for
metric
in
eval_metrics
+
model
.
metrics
:
metric
.
reset_states
()
# We add train_loss here rather than call on_batch_end twice to make
# sure that no duplicated values are generated.
logs
[
'loss'
]
=
train_loss
callback_list
.
on_batch_end
(
current_step
-
1
,
logs
)
callback_list
.
on_epoch_end
(
int
(
current_step
/
steps_per_epoch
),
logs
)
# Calls on_epoch_end after each real epoch ends to prevent mis-calculation
# of training steps.
if
current_step
%
steps_per_epoch
==
0
:
callback_list
.
on_epoch_end
(
int
(
current_step
/
steps_per_epoch
),
logs
)
if
sub_model_export_name
:
_save_checkpoint
(
strategy
,
sub_model_checkpoint
,
model_dir
,
...
...
@@ -532,14 +542,11 @@ def run_customized_training_loop(
_save_checkpoint
(
strategy
,
checkpoint
,
model_dir
,
checkpoint_name
.
format
(
step
=
current_step
))
logs
=
None
if
eval_input_fn
:
logging
.
info
(
'Running final evaluation after training is complete.'
)
logs
=
_run_evaluation
(
current_step
,
_get_input_iterator
(
eval_input_fn
,
strategy
))
callback_list
.
on_epoch_end
(
int
(
current_step
/
steps_per_epoch
),
logs
)
training_summary
=
{
'total_training_steps'
:
total_training_steps
,
'train_loss'
:
_float_metric_value
(
train_loss_metric
),
...
...
official/nlp/bert/model_training_utils_test.py
View file @
85cfe94d
...
...
@@ -258,6 +258,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
loss_fn
=
tf
.
keras
.
losses
.
categorical_crossentropy
,
model_dir
=
model_dir
,
steps_per_epoch
=
20
,
num_eval_per_epoch
=
4
,
steps_per_loop
=
10
,
epochs
=
2
,
train_input_fn
=
input_fn
,
...
...
@@ -269,14 +270,15 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
run_eagerly
=
False
)
self
.
assertEqual
(
callback
.
epoch_begin
,
[(
1
,
{}),
(
2
,
{})])
epoch_ends
,
epoch_end_infos
=
zip
(
*
callback
.
epoch_end
)
self
.
assertEqual
(
list
(
epoch_ends
),
[
1
,
2
])
self
.
assertEqual
(
list
(
epoch_ends
),
[
1
,
2
,
2
])
for
info
in
epoch_end_infos
:
self
.
assertIn
(
'accuracy'
,
info
)
self
.
assertEqual
(
callback
.
batch_begin
,
[(
0
,
{}),
(
10
,
{}),
(
20
,
{}),
(
30
,
{})])
self
.
assertEqual
(
callback
.
batch_begin
,
[(
0
,
{}),
(
5
,
{}),
(
10
,
{}),
(
15
,
{}),
(
20
,
{}),
(
25
,
{}),
(
30
,
{}),
(
35
,
{})])
batch_ends
,
batch_end_infos
=
zip
(
*
callback
.
batch_end
)
self
.
assertEqual
(
list
(
batch_ends
),
[
9
,
1
9
,
29
,
39
])
self
.
assertEqual
(
list
(
batch_ends
),
[
4
,
9
,
1
4
,
19
,
24
,
29
,
34
,
39
])
for
info
in
batch_end_infos
:
self
.
assertIn
(
'loss'
,
info
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment