Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
322037e8
Unverified
Commit
322037e8
authored
Feb 08, 2021
by
Stas Bekman
Committed by
GitHub
Feb 08, 2021
Browse files
[trainer] deepspeed bug fixes and tests (#10039)
* deepspeed bug fixes and tests * manual wrap?
parent
f285e4c3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
151 additions
and
36 deletions
+151
-36
examples/seq2seq/test_deepspeed.py
examples/seq2seq/test_deepspeed.py
+132
-0
examples/seq2seq/test_finetune_trainer.py
examples/seq2seq/test_finetune_trainer.py
+4
-35
src/transformers/trainer.py
src/transformers/trainer.py
+15
-1
No files found.
examples/seq2seq/test_deepspeed.py
0 → 100644
View file @
322037e8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
unittest
from
transformers.integrations
import
is_deepspeed_available
from
transformers.testing_utils
import
TestCasePlus
,
execute_subprocess_async
,
require_torch_multi_gpu
from
transformers.trainer_callback
import
TrainerState
from
transformers.trainer_utils
import
set_seed
from
utils
import
load_json
set_seed
(
42
)
MBART_TINY
=
"sshleifer/tiny-mbart"
# a candidate for testing_utils
def
require_deepspeed
(
test_case
):
"""
Decorator marking a test that requires deepspeed
"""
if
not
is_deepspeed_available
():
return
unittest
.
skip
(
"test requires deepspeed"
)(
test_case
)
else
:
return
test_case
@
require_deepspeed
class
TestDeepSpeed
(
TestCasePlus
):
# XXX: need to do better validation beyond just that the run was successful
def
run_quick
(
self
,
distributed
=
None
,
extra_args_str
=
None
,
remove_args_str
=
None
):
output_dir
=
self
.
run_trainer
(
1
,
"12"
,
MBART_TINY
,
1
,
distributed
,
extra_args_str
,
remove_args_str
)
logs
=
TrainerState
.
load_from_json
(
os
.
path
.
join
(
output_dir
,
"trainer_state.json"
)).
log_history
eval_metrics
=
[
log
for
log
in
logs
if
"eval_loss"
in
log
.
keys
()]
first_step_stats
=
eval_metrics
[
0
]
assert
"eval_bleu"
in
first_step_stats
def
run_quick_no_train
(
self
,
distributed
=
None
,
extra_args_str
=
None
):
remove_args_str
=
"--do_train"
output_dir
=
self
.
run_trainer
(
1
,
"12"
,
MBART_TINY
,
1
,
distributed
,
extra_args_str
,
remove_args_str
)
val_metrics
=
load_json
(
os
.
path
.
join
(
output_dir
,
"val_results.json"
))
assert
"val_bleu"
in
val_metrics
test_metrics
=
load_json
(
os
.
path
.
join
(
output_dir
,
"test_results.json"
))
assert
"test_bleu"
in
test_metrics
@
require_torch_multi_gpu
def
test_basic
(
self
):
self
.
run_quick
()
@
require_torch_multi_gpu
def
test_grad_acum
(
self
):
self
.
run_quick
(
extra_args_str
=
"--gradient_accumulation_steps 2"
)
@
require_torch_multi_gpu
def
test_no_train
(
self
):
# we should not fail if train is skipped
self
.
run_quick_no_train
()
def
run_trainer
(
self
,
eval_steps
:
int
,
max_len
:
str
,
model_name
:
str
,
num_train_epochs
:
int
,
distributed
:
bool
=
False
,
extra_args_str
:
str
=
None
,
remove_args_str
:
str
=
None
,
):
data_dir
=
self
.
examples_dir
/
"seq2seq/test_data/wmt_en_ro"
output_dir
=
self
.
get_auto_remove_tmp_dir
()
args
=
f
"""
--model_name_or_path
{
model_name
}
--data_dir
{
data_dir
}
--output_dir
{
output_dir
}
--overwrite_output_dir
--n_train 8
--n_val 8
--max_source_length
{
max_len
}
--max_target_length
{
max_len
}
--val_max_target_length
{
max_len
}
--do_train
--do_eval
--do_predict
--num_train_epochs
{
str
(
num_train_epochs
)
}
--per_device_train_batch_size 4
--per_device_eval_batch_size 4
--learning_rate 3e-3
--warmup_steps 8
--evaluation_strategy steps
--predict_with_generate
--logging_steps 0
--save_steps
{
str
(
eval_steps
)
}
--eval_steps
{
str
(
eval_steps
)
}
--group_by_length
--label_smoothing_factor 0.1
--adafactor
--task translation
--tgt_lang ro_RO
--src_lang en_XX
"""
.
split
()
# --eval_beams 2
if
extra_args_str
is
not
None
:
args
.
extend
(
extra_args_str
.
split
())
if
remove_args_str
is
not
None
:
remove_args
=
remove_args_str
.
split
()
args
=
[
x
for
x
in
args
if
x
not
in
remove_args
]
ds_args
=
f
"--deepspeed
{
self
.
test_file_dir_str
}
/ds_config.json"
.
split
()
distributed_args
=
f
"""
{
self
.
test_file_dir
}
/finetune_trainer.py
"""
.
split
()
cmd
=
[
"deepspeed"
]
+
distributed_args
+
args
+
ds_args
# keep for quick debug
# print(" ".join(cmd)); die
execute_subprocess_async
(
cmd
,
env
=
self
.
get_env
())
return
output_dir
examples/seq2seq/test_finetune_trainer.py
View file @
322037e8
...
@@ -18,7 +18,7 @@ import unittest
...
@@ -18,7 +18,7 @@ import unittest
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
transformers.file_utils
import
is_apex_available
from
transformers.file_utils
import
is_apex_available
from
transformers.integrations
import
is_deepspeed_available
,
is_fairscale_available
from
transformers.integrations
import
is_fairscale_available
from
transformers.testing_utils
import
(
from
transformers.testing_utils
import
(
TestCasePlus
,
TestCasePlus
,
execute_subprocess_async
,
execute_subprocess_async
,
...
@@ -49,17 +49,6 @@ def require_fairscale(test_case):
...
@@ -49,17 +49,6 @@ def require_fairscale(test_case):
return
test_case
return
test_case
# a candidate for testing_utils
def
require_deepspeed
(
test_case
):
"""
Decorator marking a test that requires deepspeed
"""
if
not
is_deepspeed_available
():
return
unittest
.
skip
(
"test requires deepspeed"
)(
test_case
)
else
:
return
test_case
# a candidate for testing_utils
# a candidate for testing_utils
def
require_apex
(
test_case
):
def
require_apex
(
test_case
):
"""
"""
...
@@ -72,8 +61,8 @@ def require_apex(test_case):
...
@@ -72,8 +61,8 @@ def require_apex(test_case):
class
TestFinetuneTrainer
(
TestCasePlus
):
class
TestFinetuneTrainer
(
TestCasePlus
):
def
finetune_trainer_quick
(
self
,
distributed
=
None
,
deepspeed
=
False
,
extra_args_str
=
None
):
def
finetune_trainer_quick
(
self
,
distributed
=
None
,
extra_args_str
=
None
):
output_dir
=
self
.
run_trainer
(
1
,
"12"
,
MBART_TINY
,
1
,
distributed
,
deepspeed
,
extra_args_str
)
output_dir
=
self
.
run_trainer
(
1
,
"12"
,
MBART_TINY
,
1
,
distributed
,
extra_args_str
)
logs
=
TrainerState
.
load_from_json
(
os
.
path
.
join
(
output_dir
,
"trainer_state.json"
)).
log_history
logs
=
TrainerState
.
load_from_json
(
os
.
path
.
join
(
output_dir
,
"trainer_state.json"
)).
log_history
eval_metrics
=
[
log
for
log
in
logs
if
"eval_loss"
in
log
.
keys
()]
eval_metrics
=
[
log
for
log
in
logs
if
"eval_loss"
in
log
.
keys
()]
first_step_stats
=
eval_metrics
[
0
]
first_step_stats
=
eval_metrics
[
0
]
...
@@ -107,16 +96,6 @@ class TestFinetuneTrainer(TestCasePlus):
...
@@ -107,16 +96,6 @@ class TestFinetuneTrainer(TestCasePlus):
def
test_finetune_trainer_apex
(
self
):
def
test_finetune_trainer_apex
(
self
):
self
.
finetune_trainer_quick
(
extra_args_str
=
"--fp16 --fp16_backend=apex"
)
self
.
finetune_trainer_quick
(
extra_args_str
=
"--fp16 --fp16_backend=apex"
)
@
require_torch_multi_gpu
@
require_deepspeed
def
test_finetune_trainer_deepspeed
(
self
):
self
.
finetune_trainer_quick
(
deepspeed
=
True
)
@
require_torch_multi_gpu
@
require_deepspeed
def
test_finetune_trainer_deepspeed_grad_acum
(
self
):
self
.
finetune_trainer_quick
(
deepspeed
=
True
,
extra_args_str
=
"--gradient_accumulation_steps 2"
)
@
slow
@
slow
def
test_finetune_trainer_slow
(
self
):
def
test_finetune_trainer_slow
(
self
):
# There is a missing call to __init__process_group somewhere
# There is a missing call to __init__process_group somewhere
...
@@ -146,7 +125,6 @@ class TestFinetuneTrainer(TestCasePlus):
...
@@ -146,7 +125,6 @@ class TestFinetuneTrainer(TestCasePlus):
model_name
:
str
,
model_name
:
str
,
num_train_epochs
:
int
,
num_train_epochs
:
int
,
distributed
:
bool
=
False
,
distributed
:
bool
=
False
,
deepspeed
:
bool
=
False
,
extra_args_str
:
str
=
None
,
extra_args_str
:
str
=
None
,
):
):
data_dir
=
self
.
examples_dir
/
"seq2seq/test_data/wmt_en_ro"
data_dir
=
self
.
examples_dir
/
"seq2seq/test_data/wmt_en_ro"
...
@@ -186,15 +164,7 @@ class TestFinetuneTrainer(TestCasePlus):
...
@@ -186,15 +164,7 @@ class TestFinetuneTrainer(TestCasePlus):
if
extra_args_str
is
not
None
:
if
extra_args_str
is
not
None
:
args
.
extend
(
extra_args_str
.
split
())
args
.
extend
(
extra_args_str
.
split
())
if
deepspeed
:
if
distributed
:
ds_args
=
f
"--deepspeed
{
self
.
test_file_dir_str
}
/ds_config.json"
.
split
()
distributed_args
=
f
"""
{
self
.
test_file_dir
}
/finetune_trainer.py
"""
.
split
()
cmd
=
[
"deepspeed"
]
+
distributed_args
+
args
+
ds_args
execute_subprocess_async
(
cmd
,
env
=
self
.
get_env
())
elif
distributed
:
n_gpu
=
get_gpu_count
()
n_gpu
=
get_gpu_count
()
distributed_args
=
f
"""
distributed_args
=
f
"""
-m torch.distributed.launch
-m torch.distributed.launch
...
@@ -203,7 +173,6 @@ class TestFinetuneTrainer(TestCasePlus):
...
@@ -203,7 +173,6 @@ class TestFinetuneTrainer(TestCasePlus):
"""
.
split
()
"""
.
split
()
cmd
=
[
sys
.
executable
]
+
distributed_args
+
args
cmd
=
[
sys
.
executable
]
+
distributed_args
+
args
execute_subprocess_async
(
cmd
,
env
=
self
.
get_env
())
execute_subprocess_async
(
cmd
,
env
=
self
.
get_env
())
else
:
else
:
testargs
=
[
"finetune_trainer.py"
]
+
args
testargs
=
[
"finetune_trainer.py"
]
+
args
with
patch
.
object
(
sys
,
"argv"
,
testargs
):
with
patch
.
object
(
sys
,
"argv"
,
testargs
):
...
...
src/transformers/trainer.py
View file @
322037e8
...
@@ -932,7 +932,11 @@ class Trainer:
...
@@ -932,7 +932,11 @@ class Trainer:
if
(
step
+
1
)
%
self
.
args
.
gradient_accumulation_steps
==
0
:
if
(
step
+
1
)
%
self
.
args
.
gradient_accumulation_steps
==
0
:
self
.
control
=
self
.
callback_handler
.
on_step_begin
(
self
.
args
,
self
.
state
,
self
.
control
)
self
.
control
=
self
.
callback_handler
.
on_step_begin
(
self
.
args
,
self
.
state
,
self
.
control
)
if
((
step
+
1
)
%
self
.
args
.
gradient_accumulation_steps
!=
0
)
and
self
.
args
.
local_rank
!=
-
1
:
if
(
((
step
+
1
)
%
self
.
args
.
gradient_accumulation_steps
!=
0
)
and
self
.
args
.
local_rank
!=
-
1
and
not
self
.
args
.
deepspeed
):
# Avoid unnecessary DDP synchronization since there will be no backward pass on this example.
# Avoid unnecessary DDP synchronization since there will be no backward pass on this example.
with
model
.
no_sync
():
with
model
.
no_sync
():
tr_loss
+=
self
.
training_step
(
model
,
inputs
)
tr_loss
+=
self
.
training_step
(
model
,
inputs
)
...
@@ -1588,7 +1592,17 @@ class Trainer:
...
@@ -1588,7 +1592,17 @@ class Trainer:
prediction_loss_only
if
prediction_loss_only
is
not
None
else
self
.
args
.
prediction_loss_only
prediction_loss_only
if
prediction_loss_only
is
not
None
else
self
.
args
.
prediction_loss_only
)
)
if
self
.
args
.
deepspeed
and
not
self
.
args
.
do_train
:
# In the future we probably can run deepspeed for inference too, but this will require
# some thinking about how to best run it - since while it works DeepSpeed wasn't
# designed for inference
# since we have to postpone model.to() till training for DeepSpeed, if there was no
# training, we must put the model on the right device
self
.
model
=
self
.
model
.
to
(
self
.
args
.
device
)
model
=
self
.
model
model
=
self
.
model
# multi-gpu eval
# multi-gpu eval
if
self
.
args
.
n_gpu
>
1
:
if
self
.
args
.
n_gpu
>
1
:
model
=
torch
.
nn
.
DataParallel
(
model
)
model
=
torch
.
nn
.
DataParallel
(
model
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment