Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
fc2056bc
Commit
fc2056bc
authored
Oct 30, 2019
by
Hongkun Yu
Committed by
A. Unique TensorFlower
Oct 30, 2019
Browse files
Clean up primary_cpu_task. It is not necessary.
PiperOrigin-RevId: 277649735
parent
53eff257
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
63 deletions
+45
-63
official/modeling/training/distributed_executor.py
official/modeling/training/distributed_executor.py
+1
-16
official/nlp/bert/run_squad.py
official/nlp/bert/run_squad.py
+44
-47
No files found.
official/modeling/training/distributed_executor.py
View file @
fc2056bc
...
...
@@ -131,13 +131,6 @@ def hparam_flags_dict():
}
def primary_cpu_task(use_remote_tpu=False):
  """Returns the primary CPU task to which input pipeline ops are put.

  Args:
    use_remote_tpu: Whether training runs in remote-TPU mode.

  Returns:
    The device/job string for the TPU worker when `use_remote_tpu` is True,
    otherwise the empty string (local placement).
  """
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  if use_remote_tpu:
    return '/job:worker'
  return ''
def
_save_checkpoint
(
checkpoint
,
model_dir
,
checkpoint_prefix
):
"""Saves model to model_dir with provided checkpoint prefix."""
...
...
@@ -204,7 +197,6 @@ class DistributedExecutor(object):
metric_fn: metric function. Signature: () -> tf.keras.metrics.Metric.
is_multi_host: Set to True when using multi hosts for training, like multi
worker GPU or TPU pod (slice). Otherwise, False.
use_remote_tpu: If True, run on remote TPU mode.
"""
def
__init__
(
self
,
...
...
@@ -212,14 +204,12 @@ class DistributedExecutor(object):
params
,
model_fn
,
loss_fn
,
is_multi_host
=
False
,
use_remote_tpu
=
False
):
is_multi_host
=
False
):
self
.
_params
=
params
self
.
_model_fn
=
model_fn
self
.
_loss_fn
=
loss_fn
self
.
_strategy
=
strategy
self
.
_use_remote_tpu
=
use_remote_tpu
self
.
_checkpoint_name
=
'ctl_step_{step}.ckpt'
self
.
_is_multi_host
=
is_multi_host
...
...
@@ -867,11 +857,6 @@ class ExecutorBuilder(object):
raise
ValueError
(
'`strategy` should not be None. You need to specify '
'`strategy_type` in the builder contructor or directly '
'set the `strategy` property of the builder.'
)
if
'use_remote_tpu'
not
in
kwargs
:
use_remote_tpu
=
(
isinstance
(
self
.
_strategy
,
tf
.
distribute
.
experimental
.
TPUStrategy
)
and
bool
(
self
.
_strategy_config
.
tpu
))
kwargs
[
'use_remote_tpu'
]
=
use_remote_tpu
return
class_ctor
(
strategy
=
self
.
_strategy
,
params
=
params
,
...
...
official/nlp/bert/run_squad.py
View file @
fc2056bc
...
...
@@ -135,53 +135,50 @@ def get_raw_results(predictions):
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
  """Make predictions using a Bert-based squad model."""
  # Pin input-pipeline ops to the remote TPU worker's CPU when running on a
  # Cloud/Borg TPU; '' leaves placement to the default device.
  primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

  with tf.device(primary_cpu_task):
    # Evaluation-mode dataset: no shuffling/repeats (is_training=False).
    predict_dataset = input_pipeline.create_squad_dataset(
        predict_tfrecord_path,
        input_meta_data['max_seq_length'],
        FLAGS.predict_batch_size,
        is_training=False)
    predict_iterator = iter(
        strategy.experimental_distribute_dataset(predict_dataset))

    # NOTE(review): the strategy.scope() nesting below is reconstructed from a
    # flattened diff — confirm against the original file that only model
    # construction (not checkpoint restore) sits inside the scope.
    with strategy.scope():
      # Prediction always uses float32, even if training uses mixed precision.
      tf.keras.mixed_precision.experimental.set_policy('float32')
      squad_model, _ = bert_models.squad_model(
          bert_config,
          input_meta_data['max_seq_length'],
          float_type=tf.float32)

    # Restore the latest training checkpoint; expect_partial() suppresses
    # warnings about training-only objects (e.g. optimizer slots) not used at
    # prediction time.
    checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    logging.info('Restoring checkpoints from %s', checkpoint_path)
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(checkpoint_path).expect_partial()

    @tf.function
    def predict_step(iterator):
      """Predicts on distributed devices."""

      def _replicated_step(inputs):
        """Replicated prediction calculation."""
        # Dataset yields (features, labels); labels are unused at predict time.
        x, _ = inputs
        unique_ids, start_logits, end_logits = squad_model(x, training=False)
        return dict(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits)

      # Run one replicated step and gather the per-replica outputs.
      outputs = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
      return tf.nest.map_structure(strategy.experimental_local_results,
                                   outputs)

    all_results = []
    for _ in range(num_steps):
      predictions = predict_step(predict_iterator)
      for result in get_raw_results(predictions):
        all_results.append(result)
      # Periodic progress log; fires only when the running total is a
      # multiple of 100.
      if len(all_results) % 100 == 0:
        logging.info('Made predictions for %d records.', len(all_results))

    return all_results
  # Post-commit body of predict_squad_customized: identical logic to the old
  # version minus the tf.device(primary_cpu_task) wrapper, which this commit
  # removes as unnecessary.
  # Evaluation-mode dataset: no shuffling/repeats (is_training=False).
  predict_dataset = input_pipeline.create_squad_dataset(
      predict_tfrecord_path,
      input_meta_data['max_seq_length'],
      FLAGS.predict_batch_size,
      is_training=False)
  predict_iterator = iter(
      strategy.experimental_distribute_dataset(predict_dataset))

  # NOTE(review): scope nesting reconstructed from a flattened diff — confirm
  # only model construction sits inside strategy.scope() in the real file.
  with strategy.scope():
    # Prediction always uses float32, even if training uses mixed precision.
    tf.keras.mixed_precision.experimental.set_policy('float32')
    squad_model, _ = bert_models.squad_model(
        bert_config,
        input_meta_data['max_seq_length'],
        float_type=tf.float32)

  # Restore the latest training checkpoint; expect_partial() suppresses
  # warnings about training-only objects not used at prediction time.
  checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
  logging.info('Restoring checkpoints from %s', checkpoint_path)
  checkpoint = tf.train.Checkpoint(model=squad_model)
  checkpoint.restore(checkpoint_path).expect_partial()

  @tf.function
  def predict_step(iterator):
    """Predicts on distributed devices."""

    def _replicated_step(inputs):
      """Replicated prediction calculation."""
      # Dataset yields (features, labels); labels are unused at predict time.
      x, _ = inputs
      unique_ids, start_logits, end_logits = squad_model(x, training=False)
      return dict(
          unique_ids=unique_ids,
          start_logits=start_logits,
          end_logits=end_logits)

    # Run one replicated step and gather the per-replica outputs.
    outputs = strategy.experimental_run_v2(
        _replicated_step, args=(next(iterator),))
    return tf.nest.map_structure(strategy.experimental_local_results,
                                 outputs)

  all_results = []
  for _ in range(num_steps):
    predictions = predict_step(predict_iterator)
    for result in get_raw_results(predictions):
      all_results.append(result)
    # Periodic progress log; fires only when the running total is a
    # multiple of 100.
    if len(all_results) % 100 == 0:
      logging.info('Made predictions for %d records.', len(all_results))

  return all_results
def
train_squad
(
strategy
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment