ModelZoo / ResNet50_tensorflow / Commits / fc02382c

Commit fc02382c
Authored Mar 30, 2020 by Hongkun Yu; committed by A. Unique TensorFlower on Mar 30, 2020
Move an R1-specific util function from common utils to R1 models.
PiperOrigin-RevId: 303767122
Parent: 01d1931f
Showing 4 changed files with 65 additions and 51 deletions.
official/r1/resnet/resnet_run_loop.py            +33  -2
official/r1/transformer/transformer_main.py      +32  -2
official/utils/misc/distribution_utils.py         +0  -31
official/utils/misc/distribution_utils_test.py    +0  -16
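For context before the per-file hunks: the relocated helper only divides the global batch size evenly across GPUs for Estimator-based runs and rejects sizes that do not divide evenly. A minimal, runnable sketch of that behavior; the example numbers are illustrative and not part of the commit:

    # Sketch of the helper being moved; it mirrors the logic in the hunks below.
    def per_replica_batch_size(batch_size, num_gpus):
      # Single-device (or CPU-only) runs keep the global batch size unchanged.
      if num_gpus <= 1:
        return batch_size
      # Multi-GPU runs require an even split of the global batch.
      if batch_size % num_gpus:
        raise ValueError('batch_size {} is not divisible by {} GPUs'
                         .format(batch_size, num_gpus))
      return batch_size // num_gpus

    print(per_replica_batch_size(256, 8))  # 32 examples per GPU
    print(per_replica_batch_size(147, 0))  # 147, no split on a single device
    # per_replica_batch_size(147, 5) raises ValueError because 147 % 5 == 2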
official/r1/resnet/resnet_run_loop.py

@@ -329,6 +329,37 @@ def learning_rate_with_decay(
   return learning_rate_fn


+def per_replica_batch_size(batch_size, num_gpus):
+  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
+
+  Note that distribution strategy handles this automatically when used with
+  Keras. For using with Estimator, we need to get per GPU batch.
+
+  Args:
+    batch_size: Global batch size to be divided among devices. This should be
+      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
+    num_gpus: How many GPUs are used with DistributionStrategies.
+
+  Returns:
+    Batch size per device.
+
+  Raises:
+    ValueError: if batch_size is not divisible by number of devices
+  """
+  if num_gpus <= 1:
+    return batch_size
+
+  remainder = batch_size % num_gpus
+  if remainder:
+    err = ('When running with multiple GPUs, batch size '
+           'must be a multiple of the number of available GPUs. Found {} '
+           'GPUs with a batch size of {}; try --batch_size={} instead.'
+          ).format(num_gpus, batch_size, batch_size - remainder)
+    raise ValueError(err)
+  return int(batch_size / num_gpus)
+
+
 def resnet_model_fn(features, labels, mode, model_class,
                     resnet_size, weight_decay, learning_rate_fn, momentum,
                     data_format, resnet_version, loss_scale,

@@ -620,7 +651,7 @@ def resnet_main(
     return input_function(
         is_training=True,
         data_dir=flags_obj.data_dir,
-        batch_size=distribution_utils.per_replica_batch_size(
+        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
         num_epochs=num_epochs,
         dtype=flags_core.get_tf_dtype(flags_obj),

@@ -631,7 +662,7 @@ def resnet_main(
     return input_function(
         is_training=False,
         data_dir=flags_obj.data_dir,
-        batch_size=distribution_utils.per_replica_batch_size(
+        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
         num_epochs=1,
         dtype=flags_core.get_tf_dtype(flags_obj))
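As the docstring notes, Keras with a distribution strategy splits the global batch automatically, while the Estimator path above must hand each replica its own batch size. A hedged sketch of how the per-replica value feeds a tf.data pipeline for one replica; make_dataset, global_batch, and the constants are stand-ins, not names from this repository:

    import tensorflow as tf

    def make_dataset(per_replica_batch):
      # Each replica's input_fn batches with the per-replica size, so the
      # effective global batch is num_gpus * per_replica_batch.
      return tf.data.Dataset.range(1024).batch(per_replica_batch)

    global_batch = 256
    num_gpus = 8
    # Equivalent to per_replica_batch_size(global_batch, num_gpus) from the hunk above.
    per_replica = global_batch // num_gpus  # 32
    dataset = make_dataset(per_replica)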
official/r1/transformer/transformer_main.py

@@ -562,6 +562,36 @@ def construct_estimator(flags_obj, params, schedule_manager):
       },
       config=run_config)


+def per_replica_batch_size(batch_size, num_gpus):
+  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
+
+  Note that distribution strategy handles this automatically when used with
+  Keras. For using with Estimator, we need to get per GPU batch.
+
+  Args:
+    batch_size: Global batch size to be divided among devices. This should be
+      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
+    num_gpus: How many GPUs are used with DistributionStrategies.
+
+  Returns:
+    Batch size per device.
+
+  Raises:
+    ValueError: if batch_size is not divisible by number of devices
+  """
+  if num_gpus <= 1:
+    return batch_size
+
+  remainder = batch_size % num_gpus
+  if remainder:
+    err = ('When running with multiple GPUs, batch size '
+           'must be a multiple of the number of available GPUs. Found {} '
+           'GPUs with a batch size of {}; try --batch_size={} instead.'
+          ).format(num_gpus, batch_size, batch_size - remainder)
+    raise ValueError(err)
+  return int(batch_size / num_gpus)
+
+
 def run_transformer(flags_obj):
   """Create tf.Estimator to train and evaluate transformer model.

@@ -605,8 +635,8 @@ def run_transformer(flags_obj):
   total_batch_size = params["batch_size"]
   if not params["use_tpu"]:
-    params["batch_size"] = distribution_utils.per_replica_batch_size(
-        params["batch_size"], num_gpus)
+    params["batch_size"] = per_replica_batch_size(
+        params["batch_size"], num_gpus)

   schedule_manager = schedule.Manager(
       train_steps=flags_obj.train_steps,
official/utils/misc/distribution_utils.py

@@ -157,37 +157,6 @@ def get_distribution_strategy(distribution_strategy="mirrored",
        "Unrecognized Distribution Strategy: %r" % distribution_strategy)


-def per_replica_batch_size(batch_size, num_gpus):
-  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
-
-  Note that distribution strategy handles this automatically when used with
-  Keras. For using with Estimator, we need to get per GPU batch.
-
-  Args:
-    batch_size: Global batch size to be divided among devices. This should be
-      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
-    num_gpus: How many GPUs are used with DistributionStrategies.
-
-  Returns:
-    Batch size per device.
-
-  Raises:
-    ValueError: if batch_size is not divisible by number of devices
-  """
-  if num_gpus <= 1:
-    return batch_size
-
-  remainder = batch_size % num_gpus
-  if remainder:
-    err = ('When running with multiple GPUs, batch size '
-           'must be a multiple of the number of available GPUs. Found {} '
-           'GPUs with a batch size of {}; try --batch_size={} instead.'
-          ).format(num_gpus, batch_size, batch_size - remainder)
-    raise ValueError(err)
-  return int(batch_size / num_gpus)
-
-
 # The `SyntheticDataset` is a temporary solution for generating synthetic data
 # directly on devices. It is only useful for Keras with Distribution
 # Strategies. We will have better support in `tf.data` or Distribution Strategy
official/utils/misc/distribution_utils_test.py

@@ -45,21 +45,5 @@ class GetDistributionStrategyTest(tf.test.TestCase):
        self.assertIn('GPU', device)


-class PerReplicaBatchSizeTest(tf.test.TestCase):
-  """Tests for per_replica_batch_size."""
-
-  def test_batch_size(self):
-    self.assertEquals(
-        distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
-    self.assertEquals(
-        distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
-    self.assertEquals(
-        distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)
-
-  def test_batch_size_with_remainder(self):
-    with self.assertRaises(ValueError):
-      distribution_utils.per_replica_batch_size(147, num_gpus=5)
-
-
 if __name__ == "__main__":
   tf.test.main()
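The deleted PerReplicaBatchSizeTest covered the single-device, evenly divisible, and remainder cases against distribution_utils. This commit does not add replacement tests; purely as an illustration, equivalent assertions against the relocated copy might look like the following, where the import path and test class name are hypothetical:

    import tensorflow as tf
    # Hypothetical import of one of the new R1 locations of the helper.
    from official.r1.resnet import resnet_run_loop

    class PerReplicaBatchSizeTest(tf.test.TestCase):

      def test_batch_size(self):
        # Single-device and evenly divisible cases mirror the removed tests.
        self.assertEqual(resnet_run_loop.per_replica_batch_size(147, num_gpus=0), 147)
        self.assertEqual(resnet_run_loop.per_replica_batch_size(147, num_gpus=1), 147)
        self.assertEqual(resnet_run_loop.per_replica_batch_size(147, num_gpus=7), 21)

      def test_batch_size_with_remainder(self):
        with self.assertRaises(ValueError):
          resnet_run_loop.per_replica_batch_size(147, num_gpus=5)

    if __name__ == "__main__":
      tf.test.main()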