Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
6015d0ad
Unverified
Commit
6015d0ad
authored
Jan 10, 2024
by
Zach Mueller
Committed by
GitHub
Jan 10, 2024
Browse files
Support `DeepSpeed` when using auto find batch size (#28088)
Fixup test
parent
a777f525
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
84 additions
and
17 deletions
+84
-17
src/transformers/integrations/deepspeed.py
src/transformers/integrations/deepspeed.py
+10
-3
src/transformers/trainer.py
src/transformers/trainer.py
+28
-14
tests/trainer/test_trainer.py
tests/trainer/test_trainer.py
+46
-0
No files found.
src/transformers/integrations/deepspeed.py
View file @
6015d0ad
...
...
@@ -129,7 +129,7 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
fill_only
=
partialmethod
(
fill_match
,
must_match
=
False
)
def
trainer_config_process
(
self
,
args
):
def
trainer_config_process
(
self
,
args
,
auto_find_batch_size
=
False
):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
...
...
@@ -138,10 +138,15 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
# train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
train_batch_size
=
args
.
world_size
*
args
.
per_device_train_batch_size
*
args
.
gradient_accumulation_steps
self
.
fill_match
(
"train_micro_batch_size_per_gpu"
,
args
.
per_device_train_batch_size
,
"per_device_train_batch_size"
"train_micro_batch_size_per_gpu"
,
args
.
per_device_train_batch_size
,
"per_device_train_batch_size"
,
not
auto_find_batch_size
,
)
self
.
fill_match
(
"gradient_accumulation_steps"
,
args
.
gradient_accumulation_steps
,
"gradient_accumulation_steps"
)
self
.
fill_match
(
"train_batch_size"
,
train_batch_size
,
"train_batch_size (calculated)"
)
self
.
fill_match
(
"train_batch_size"
,
train_batch_size
,
"train_batch_size (calculated)"
,
not
auto_find_batch_size
)
self
.
fill_match
(
"gradient_clipping"
,
args
.
max_grad_norm
,
"max_grad_norm"
)
self
.
fill_match
(
"optimizer.params.lr"
,
args
.
learning_rate
,
"learning_rate"
)
...
...
@@ -336,6 +341,8 @@ def deepspeed_init(trainer, num_training_steps, inference=False):
num_training_steps: per single gpu
resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
inference: launch in inference mode (no optimizer and no lr scheduler)
auto_find_batch_size: whether to ignore the `train_micro_batch_size_per_gpu` argument as it's being
set automatically by the auto batch size finder
Returns: optimizer, lr_scheduler
...
...
src/transformers/trainer.py
View file @
6015d0ad
...
...
@@ -1506,13 +1506,9 @@ class Trainer:
if
resume_from_checkpoint
is
None
:
raise
ValueError
(
f
"No valid checkpoint found in output directory (
{
args
.
output_dir
}
)"
)
if
(
resume_from_checkpoint
is
not
None
and
not
is_sagemaker_mp_enabled
()
and
not
self
.
is_deepspeed_enabled
and
not
self
.
is_fsdp_enabled
):
self
.
_load_from_checkpoint
(
resume_from_checkpoint
)
if
resume_from_checkpoint
is
not
None
:
if
not
is_sagemaker_mp_enabled
()
and
not
self
.
is_deepspeed_enabled
and
not
self
.
is_fsdp_enabled
:
self
.
_load_from_checkpoint
(
resume_from_checkpoint
)
# In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly
state
=
TrainerState
.
load_from_json
(
os
.
path
.
join
(
resume_from_checkpoint
,
TRAINER_STATE_NAME
))
if
state
.
train_batch_size
is
not
None
:
...
...
@@ -1553,6 +1549,19 @@ class Trainer:
self
.
accelerator
.
free_memory
()
self
.
_train_batch_size
=
batch_size
if
self
.
args
.
auto_find_batch_size
:
if
self
.
state
.
train_batch_size
!=
self
.
_train_batch_size
:
from
accelerate.utils
import
release_memory
(
self
.
model_wrapped
,)
=
release_memory
(
self
.
model_wrapped
)
self
.
model_wrapped
=
self
.
model
# Check for DeepSpeed *after* the intial pass and modify the config
if
self
.
is_deepspeed_enabled
:
# Temporarily unset `self.args.train_batch_size`
original_bs
=
self
.
args
.
per_device_train_batch_size
self
.
args
.
per_device_train_batch_size
=
self
.
_train_batch_size
//
max
(
1
,
self
.
args
.
n_gpu
)
self
.
propagate_args_to_deepspeed
(
True
)
self
.
args
.
per_device_train_batch_size
=
original_bs
self
.
state
.
train_batch_size
=
self
.
_train_batch_size
logger
.
debug
(
f
"Currently training with a batch size of:
{
self
.
_train_batch_size
}
"
)
# Data loader and number of training steps
...
...
@@ -3944,12 +3953,17 @@ class Trainer:
"when using FSDP."
)
if
self
.
is_deepspeed_enabled
:
if
getattr
(
self
.
args
,
"hf_deepspeed_config"
,
None
)
is
None
:
from
transformers.integrations.deepspeed
import
HfTrainerDeepSpeedConfig
if
self
.
is_deepspeed_enabled
and
getattr
(
self
.
args
,
"hf_deepspeed_config"
,
None
)
is
None
:
self
.
propagate_args_to_deepspeed
()
def
propagate_args_to_deepspeed
(
self
,
auto_find_batch_size
=
False
):
"""
Sets values in the deepspeed plugin based on the Trainer args
"""
from
transformers.integrations.deepspeed
import
HfTrainerDeepSpeedConfig
ds_plugin
=
self
.
accelerator
.
state
.
deepspeed_plugin
ds_plugin
=
self
.
accelerator
.
state
.
deepspeed_plugin
ds_plugin
.
hf_ds_config
=
HfTrainerDeepSpeedConfig
(
ds_plugin
.
hf_ds_config
.
config
)
ds_plugin
.
deepspeed_config
=
ds_plugin
.
hf_ds_config
.
config
ds_plugin
.
hf_ds_config
.
trainer_config_process
(
self
.
args
)
ds_plugin
.
hf_ds_config
=
HfTrainerDeepSpeedConfig
(
ds_plugin
.
hf_ds_config
.
config
)
ds_plugin
.
deepspeed_config
=
ds_plugin
.
hf_ds_config
.
config
ds_plugin
.
hf_ds_config
.
trainer_config_process
(
self
.
args
,
auto_find_batch_size
)
tests/trainer/test_trainer.py
View file @
6015d0ad
...
...
@@ -57,6 +57,7 @@ from transformers.testing_utils import (
get_tests_dir
,
is_staging_test
,
require_accelerate
,
require_deepspeed
,
require_intel_extension_for_pytorch
,
require_optuna
,
require_ray
,
...
...
@@ -1551,6 +1552,51 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
with
patch
.
object
(
sys
,
"argv"
,
testargs
):
run_glue
.
main
()
@
require_deepspeed
def
test_auto_batch_size_with_resume_from_checkpoint_with_deepspeed
(
self
):
train_dataset
=
RegressionDataset
(
length
=
128
)
config
=
RegressionModelConfig
(
a
=
0
,
b
=
2
)
model
=
RegressionRandomPreTrainedModel
(
config
)
tmp_dir
=
self
.
get_auto_remove_tmp_dir
()
class
MockCudaOOMCallback
(
TrainerCallback
):
def
on_step_end
(
self
,
args
,
state
,
control
,
**
kwargs
):
# simulate OOM on the first step
if
state
.
train_batch_size
>=
16
:
raise
RuntimeError
(
"CUDA out of memory."
)
deepspeed
=
{
"zero_optimization"
:
{
"stage"
:
1
,
},
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
}
args
=
RegressionTrainingArguments
(
tmp_dir
,
do_train
=
True
,
max_steps
=
2
,
save_steps
=
1
,
per_device_train_batch_size
=
16
,
auto_find_batch_size
=
True
,
deepspeed
=
deepspeed
,
)
trainer
=
Trainer
(
model
,
args
,
train_dataset
=
train_dataset
,
callbacks
=
[
MockCudaOOMCallback
()])
trainer
.
train
()
# After `auto_find_batch_size` is ran we should now be at 8
self
.
assertEqual
(
trainer
.
_train_batch_size
,
8
)
# We can then make a new Trainer
trainer
=
Trainer
(
model
,
args
,
train_dataset
=
train_dataset
)
# Check we are at 16 to start
self
.
assertEqual
(
trainer
.
_train_batch_size
,
16
*
max
(
trainer
.
args
.
n_gpu
,
1
))
trainer
.
train
(
resume_from_checkpoint
=
True
)
# We should be back to 8 again, picking up based upon the last ran Trainer
self
.
assertEqual
(
trainer
.
_train_batch_size
,
8
)
def
test_auto_batch_size_with_resume_from_checkpoint
(
self
):
train_dataset
=
RegressionDataset
(
length
=
128
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment