Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
9af441a4
Commit
9af441a4
authored
Oct 15, 2019
by
Yeqing Li
Committed by
A. Unique TensorFlower
Oct 15, 2019
Browse files
Internal change
PiperOrigin-RevId: 274807747
parent
ddd45b81
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
115 additions
and
91 deletions
+115
-91
official/modeling/training/distributed_executor.py
official/modeling/training/distributed_executor.py
+82
-32
official/vision/detection/executor/detection_executor.py
official/vision/detection/executor/detection_executor.py
+33
-59
No files found.
official/modeling/training/distributed_executor.py
View file @
9af441a4
...
@@ -27,14 +27,13 @@ from absl import logging
...
@@ -27,14 +27,13 @@ from absl import logging
import
tensorflow
as
tf
import
tensorflow
as
tf
# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
from
typing
import
Optional
,
Dict
,
Text
,
Callable
,
Union
,
Iterator
,
Any
from
typing
import
Optional
,
Dict
,
List
,
Text
,
Callable
,
Union
,
Iterator
,
Any
from
official.modeling.hyperparams
import
params_dict
from
official.modeling.hyperparams
import
params_dict
from
official.utils.misc
import
tpu_lib
from
official.utils.misc
import
tpu_lib
FLAGS
=
flags
.
FLAGS
FLAGS
=
flags
.
FLAGS
# TODO(yeqing): Move all the flags out of this file.
def
define_common_hparams_flags
():
def
define_common_hparams_flags
():
"""Define the common flags across models."""
"""Define the common flags across models."""
...
@@ -145,6 +144,13 @@ def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
...
@@ -145,6 +144,13 @@ def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
logging
.
info
(
'Saving model as TF checkpoint: %s'
,
saved_path
)
logging
.
info
(
'Saving model as TF checkpoint: %s'
,
saved_path
)
def
_steps_to_run
(
current_step
,
total_steps
,
steps_per_loop
):
"""Calculates steps to run on device."""
if
steps_per_loop
<=
0
:
raise
ValueError
(
'steps_per_loop should be positive integer.'
)
return
min
(
total_steps
-
current_step
,
steps_per_loop
)
def
_no_metric
():
def
_no_metric
():
return
None
return
None
...
@@ -270,7 +276,34 @@ class DistributedExecutor(object):
...
@@ -270,7 +276,34 @@ class DistributedExecutor(object):
input_data
=
input_fn
(
self
.
_params
)
input_data
=
input_fn
(
self
.
_params
)
return
iter
(
strategy
.
experimental_distribute_dataset
(
input_data
))
return
iter
(
strategy
.
experimental_distribute_dataset
(
input_data
))
# TODO(yeqing): Extract the train_step out as a class for re-usability.
def
_create_replicated_step
(
self
,
strategy
,
model
,
loss_fn
,
optimizer
,
metric
=
None
):
def
_replicated_step
(
inputs
):
"""Replicated training step."""
inputs
,
labels
=
inputs
with
tf
.
GradientTape
()
as
tape
:
outputs
=
model
(
inputs
,
training
=
True
)
prediction_loss
=
loss_fn
(
labels
,
outputs
)
loss
=
tf
.
reduce_mean
(
prediction_loss
)
loss
=
loss
/
strategy
.
num_replicas_in_sync
if
isinstance
(
metric
,
tf
.
keras
.
metrics
.
Metric
):
metric
.
update_state
(
labels
,
outputs
)
else
:
logging
.
error
(
'train metric is not an instance of '
'tf.keras.metrics.Metric.'
)
grads
=
tape
.
gradient
(
loss
,
model
.
trainable_variables
)
optimizer
.
apply_gradients
(
zip
(
grads
,
model
.
trainable_variables
))
return
loss
return
_replicated_step
def
_create_train_step
(
self
,
def
_create_train_step
(
self
,
strategy
,
strategy
,
model
,
model
,
...
@@ -290,9 +323,11 @@ class DistributedExecutor(object):
...
@@ -290,9 +323,11 @@ class DistributedExecutor(object):
Returns:
Returns:
The training step callable.
The training step callable.
"""
"""
_replicated_step
=
self
.
_create_replicated_step
(
strategy
,
model
,
loss_fn
,
optimizer
,
metric
)
@
tf
.
function
@
tf
.
function
def
train_step
(
iterator
):
def
train_step
(
iterator
,
num_steps
):
"""Performs a distributed training step.
"""Performs a distributed training step.
Args:
Args:
...
@@ -301,28 +336,15 @@ class DistributedExecutor(object):
...
@@ -301,28 +336,15 @@ class DistributedExecutor(object):
Returns:
Returns:
The loss tensor.
The loss tensor.
"""
"""
if
not
isinstance
(
num_steps
,
tf
.
Tensor
):
def
_replicated_step
(
inputs
):
raise
ValueError
(
'steps should be an Tensor. Python object may cause '
"""Replicated training step."""
'retracing.'
)
inputs
,
labels
=
inputs
with
tf
.
GradientTape
()
as
tape
:
outputs
=
model
(
inputs
,
training
=
True
)
prediction_loss
=
loss_fn
(
labels
,
outputs
)
loss
=
tf
.
reduce_mean
(
prediction_loss
)
loss
=
loss
/
strategy
.
num_replicas_in_sync
if
isinstance
(
metric
,
tf
.
keras
.
metrics
.
Metric
):
metric
.
update_state
(
labels
,
outputs
)
else
:
logging
.
error
(
'train metric is not an instance of '
'tf.keras.metrics.Metric.'
)
grads
=
tape
.
gradient
(
loss
,
model
.
trainable_variables
)
optimizer
.
apply_gradients
(
zip
(
grads
,
model
.
trainable_variables
))
return
loss
per_replica_losses
=
strategy
.
experimental_run_v2
(
per_replica_losses
=
strategy
.
experimental_run_v2
(
_replicated_step
,
args
=
(
next
(
iterator
),))
_replicated_step
,
args
=
(
next
(
iterator
),))
for
_
in
tf
.
range
(
num_steps
-
1
):
per_replica_losses
=
strategy
.
experimental_run_v2
(
_replicated_step
,
args
=
(
next
(
iterator
),))
# For reporting, we return the mean of losses.
# For reporting, we return the mean of losses.
loss
=
strategy
.
reduce
(
loss
=
strategy
.
reduce
(
...
@@ -368,6 +390,7 @@ class DistributedExecutor(object):
...
@@ -368,6 +390,7 @@ class DistributedExecutor(object):
summary_writer_fn
:
Callable
[[
Text
,
Text
],
summary_writer_fn
:
Callable
[[
Text
,
Text
],
SummaryWriter
]
=
SummaryWriter
,
SummaryWriter
]
=
SummaryWriter
,
init_checkpoint
:
Callable
[[
tf
.
keras
.
Model
],
Any
]
=
None
,
init_checkpoint
:
Callable
[[
tf
.
keras
.
Model
],
Any
]
=
None
,
custom_callbacks
:
List
[
tf
.
keras
.
callbacks
.
Callback
]
=
None
,
save_config
:
bool
=
True
):
save_config
:
bool
=
True
):
"""Runs distributed training.
"""Runs distributed training.
...
@@ -384,6 +407,9 @@ class DistributedExecutor(object):
...
@@ -384,6 +407,9 @@ class DistributedExecutor(object):
eval_metric_fn: metric_fn for evaluation in test_step.
eval_metric_fn: metric_fn for evaluation in test_step.
summary_writer_fn: function to create summary writer.
summary_writer_fn: function to create summary writer.
init_checkpoint: function to load checkpoint.
init_checkpoint: function to load checkpoint.
custom_callbacks: A list of Keras Callback objects to run during
training. More specifically, the `on_batch_begin()` and `on_batch_end()`
methods are invoked during training.
save_config: bool. Whether to save params to model_dir.
save_config: bool. Whether to save params to model_dir.
Returns:
Returns:
...
@@ -399,6 +425,25 @@ class DistributedExecutor(object):
...
@@ -399,6 +425,25 @@ class DistributedExecutor(object):
train_metric_fn
=
train_metric_fn
or
_no_metric
train_metric_fn
=
train_metric_fn
or
_no_metric
eval_metric_fn
=
eval_metric_fn
or
_no_metric
eval_metric_fn
=
eval_metric_fn
or
_no_metric
if
custom_callbacks
and
iterations_per_loop
!=
1
:
logging
.
error
(
'It is sematically wrong to run callbacks when '
'iterations_per_loop is not one (%s)'
,
iterations_per_loop
)
def
_run_callbacks_on_batch_begin
(
batch
):
"""Runs custom callbacks at the start of every step."""
if
not
custom_callbacks
:
return
for
callback
in
custom_callbacks
:
callback
.
on_batch_begin
(
batch
)
def
_run_callbacks_on_batch_end
(
batch
):
"""Runs custom callbacks at the end of every step."""
if
not
custom_callbacks
:
return
for
callback
in
custom_callbacks
:
callback
.
on_batch_end
(
batch
)
if
save_config
:
if
save_config
:
self
.
_save_config
(
model_dir
)
self
.
_save_config
(
model_dir
)
...
@@ -419,7 +464,6 @@ class DistributedExecutor(object):
...
@@ -419,7 +464,6 @@ class DistributedExecutor(object):
optimizer
=
model
.
optimizer
optimizer
=
model
.
optimizer
# Training loop starts here.
# Training loop starts here.
# TODO(yeqing): Implementing checkpoints with Callbacks.
checkpoint
=
tf
.
train
.
Checkpoint
(
model
=
model
,
optimizer
=
optimizer
)
checkpoint
=
tf
.
train
.
Checkpoint
(
model
=
model
,
optimizer
=
optimizer
)
latest_checkpoint_file
=
tf
.
train
.
latest_checkpoint
(
model_dir
)
latest_checkpoint_file
=
tf
.
train
.
latest_checkpoint
(
model_dir
)
initial_step
=
0
initial_step
=
0
...
@@ -446,19 +490,24 @@ class DistributedExecutor(object):
...
@@ -446,19 +490,24 @@ class DistributedExecutor(object):
# Continue training loop.
# Continue training loop.
train_step
=
self
.
_create_train_step
(
train_step
=
self
.
_create_train_step
(
strategy
,
model
,
self
.
loss_fn
(),
optimizer
,
metric
=
train_metric
)
strategy
=
strategy
,
model
=
model
,
loss_fn
=
self
.
loss_fn
(),
optimizer
=
optimizer
,
metric
=
train_metric
)
test_step
=
None
test_step
=
None
if
eval_input_fn
and
eval_metric
:
if
eval_input_fn
and
eval_metric
:
test_step
=
self
.
_create_test_step
(
strategy
,
model
,
metric
=
eval_metric
)
test_step
=
self
.
_create_test_step
(
strategy
,
model
,
metric
=
eval_metric
)
logging
.
info
(
'Training started'
)
logging
.
info
(
'Training started'
)
for
step
in
range
(
initial
_step
,
total_steps
)
:
while
current
_step
<
total_steps
:
current_step
=
step
+
1
num_steps
=
_steps_to_run
(
current_step
,
total_steps
,
iterations_per_loop
)
train_loss
=
train_step
(
train_iterator
)
_run_callbacks_on_batch_begin
(
current_step
)
if
current_step
%
iterations_per_loop
!=
0
:
train_loss
=
train_step
(
train_iterator
,
# Skip metric if run less than one epoch.
tf
.
convert_to_tensor
(
num_steps
,
dtype
=
tf
.
int32
))
continue
_run_callbacks_on_batch_end
(
current_step
)
current_step
+=
num_steps
train_loss
=
tf
.
nest
.
map_structure
(
lambda
x
:
x
.
numpy
().
astype
(
float
),
train_loss
=
tf
.
nest
.
map_structure
(
lambda
x
:
x
.
numpy
().
astype
(
float
),
train_loss
)
train_loss
)
...
@@ -487,7 +536,8 @@ class DistributedExecutor(object):
...
@@ -487,7 +536,8 @@ class DistributedExecutor(object):
train_summary_writer
(
train_summary_writer
(
metrics
=
train_metric_result
,
step
=
optimizer
.
iterations
)
metrics
=
train_metric_result
,
step
=
optimizer
.
iterations
)
# Saves model checkpoints and run validation steps at every epoch end.
# Saves model checkpoints and run validation steps at every
# iterations_per_loop steps.
# To avoid repeated model saving, we do not save after the last
# To avoid repeated model saving, we do not save after the last
# step of training.
# step of training.
if
current_step
<
total_steps
:
if
current_step
<
total_steps
:
...
...
official/vision/detection/executor/detection_executor.py
View file @
9af441a4
...
@@ -46,68 +46,42 @@ class DetectionDistributedExecutor(executor.DistributedExecutor):
...
@@ -46,68 +46,42 @@ class DetectionDistributedExecutor(executor.DistributedExecutor):
self
.
_predict_post_process_fn
=
predict_post_process_fn
self
.
_predict_post_process_fn
=
predict_post_process_fn
self
.
_trainable_variables_filter
=
trainable_variables_filter
self
.
_trainable_variables_filter
=
trainable_variables_filter
def
_create_train_step
(
self
,
def
_create_replicated_step
(
self
,
strategy
,
strategy
,
model
,
model
,
loss_fn
,
loss_fn
,
optimizer
,
optimizer
,
metric
=
None
):
metric
=
None
):
"""Creates a distributed training step."""
trainable_variables
=
model
.
trainable_variables
if
self
.
_trainable_variables_filter
:
@
tf
.
function
trainable_variables
=
self
.
_trainable_variables_filter
(
def
train_step
(
iterator
):
trainable_variables
)
"""Performs a distributed training step.
logging
.
info
(
'Filter trainable variables from %d to %d'
,
len
(
model
.
trainable_variables
),
len
(
trainable_variables
))
Args:
strategy: an instance of tf.distribute.Strategy.
def
_replicated_step
(
inputs
):
model: (Tensor, bool) -> Tensor. model function.
"""Replicated training step."""
loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
inputs
,
labels
=
inputs
optimizer: tf.keras.optimizers.Optimizer.
iterator: an iterator that yields input tensors.
with
tf
.
GradientTape
()
as
tape
:
metric: eval metrics to be run outside the graph.
outputs
=
model
(
inputs
,
training
=
True
)
all_losses
=
loss_fn
(
labels
,
outputs
)
Returns:
losses
=
{}
The loss tensor.
for
k
,
v
in
all_losses
.
items
():
"""
v
=
tf
.
reduce_mean
(
v
)
/
strategy
.
num_replicas_in_sync
losses
[
k
]
=
v
def
_replicated_step
(
inputs
):
loss
=
losses
[
'total_loss'
]
"""Replicated training step."""
if
isinstance
(
metric
,
tf
.
keras
.
metrics
.
Metric
):
inputs
,
labels
=
inputs
metric
.
update_state
(
labels
,
outputs
)
else
:
logging
.
error
(
'train metric is not an instance of '
'tf.keras.metrics.Metric.'
)
with
tf
.
GradientTape
()
as
tape
:
grads
=
tape
.
gradient
(
loss
,
trainable_variables
)
outputs
=
model
(
inputs
,
training
=
True
)
optimizer
.
apply_gradients
(
zip
(
grads
,
trainable_variables
))
all_losses
=
loss_fn
(
labels
,
outputs
)
losses
=
{}
for
k
,
v
in
all_losses
.
items
():
v
=
tf
.
reduce_mean
(
v
)
/
strategy
.
num_replicas_in_sync
losses
[
k
]
=
v
loss
=
losses
[
'total_loss'
]
if
isinstance
(
metric
,
tf
.
keras
.
metrics
.
Metric
):
metric
.
update_state
(
labels
,
outputs
)
else
:
logging
.
error
(
'train metric is not an instance of '
'tf.keras.metrics.Metric.'
)
trainable_variables
=
model
.
trainable_variables
if
self
.
_trainable_variables_filter
:
trainable_variables
=
self
.
_trainable_variables_filter
(
trainable_variables
)
logging
.
info
(
'Filter trainable variables from %d to %d'
,
len
(
model
.
trainable_variables
),
len
(
trainable_variables
))
grads
=
tape
.
gradient
(
loss
,
trainable_variables
)
optimizer
.
apply_gradients
(
zip
(
grads
,
trainable_variables
))
# return losses, labels
return
loss
per_replica_losses
=
strategy
.
experimental_run_v2
(
_replicated_step
,
args
=
(
next
(
iterator
),))
# For reporting, we return the mean of losses.
loss
=
strategy
.
reduce
(
tf
.
distribute
.
ReduceOp
.
MEAN
,
per_replica_losses
,
axis
=
None
)
return
loss
return
loss
return
train
_step
return
_replicated
_step
def
_create_test_step
(
self
,
strategy
,
model
,
metric
):
def
_create_test_step
(
self
,
strategy
,
model
,
metric
):
"""Creates a distributed test step."""
"""Creates a distributed test step."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment