Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
c5d7754b
Unverified
Commit
c5d7754b
authored
Nov 09, 2023
by
Hz, Ji
Committed by
GitHub
Nov 09, 2023
Browse files
device-agnostic deepspeed testing (#27342)
parent
9999b739
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
13 deletions
+14
-13
tests/deepspeed/test_deepspeed.py
tests/deepspeed/test_deepspeed.py
+14
-13
No files found.
tests/deepspeed/test_deepspeed.py
View file @
c5d7754b
...
...
@@ -38,17 +38,18 @@ from transformers.testing_utils import (
CaptureStderr
,
LoggingLevel
,
TestCasePlus
,
backend_device_count
,
execute_subprocess_async
,
get_gpu_count
,
mockenv_context
,
require_deepspeed
,
require_optuna
,
require_torch_
gpu
,
require_torch_multi_
gpu
,
require_torch_
accelerator
,
require_torch_multi_
accelerator
,
slow
,
torch_device
,
)
from
transformers.trainer_utils
import
get_last_checkpoint
,
set_seed
from
transformers.utils
import
SAFE_WEIGHTS_NAME
,
is_torch_bf16_
gpu_
available
from
transformers.utils
import
SAFE_WEIGHTS_NAME
,
is_torch_bf16_available
_on_device
if
is_torch_available
():
...
...
@@ -125,7 +126,7 @@ def get_launcher(distributed=False):
# - it won't be able to handle that
# 2. for now testing with just 2 gpus max (since some quality tests may give different
# results with mode gpus because we use very little data)
num_gpus
=
min
(
2
,
get_gpu_count
(
))
if
distributed
else
1
num_gpus
=
min
(
2
,
backend_device_count
(
torch_device
))
if
distributed
else
1
master_port
=
get_master_port
(
real_launcher
=
True
)
return
f
"deepspeed --num_nodes 1 --num_gpus
{
num_gpus
}
--master_port
{
master_port
}
"
.
split
()
...
...
@@ -145,7 +146,7 @@ optims = [HF_OPTIM, DS_OPTIM]
schedulers
=
[
HF_SCHEDULER
,
DS_SCHEDULER
]
stages
=
[
ZERO2
,
ZERO3
]
if
is_torch_bf16_
gpu_
available
(
):
if
is_torch_bf16_available
_on_device
(
torch_device
):
dtypes
=
[
FP16
,
BF16
]
else
:
dtypes
=
[
FP16
]
...
...
@@ -165,7 +166,7 @@ params_with_optims_and_schedulers = list(itertools.product(stages, dtypes, optim
@
require_deepspeed
@
require_torch_
gpu
@
require_torch_
accelerator
class
CoreIntegrationDeepSpeed
(
TestCasePlus
,
TrainerIntegrationCommon
):
"""
Testing non-Trainer DeepSpeed integration
...
...
@@ -273,7 +274,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
@
require_deepspeed
@
require_torch_
gpu
@
require_torch_
accelerator
class
TrainerIntegrationDeepSpeed
(
TrainerIntegrationDeepSpeedWithCustomConfig
,
TrainerIntegrationCommon
):
"""
...
...
@@ -875,7 +876,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
@
slow
@
require_deepspeed
@
require_torch_
gpu
@
require_torch_
accelerator
class
TestDeepSpeedWithLauncher
(
TestCasePlus
):
"""This class is for testing via an external script - can do multiple gpus"""
...
...
@@ -896,7 +897,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
#
@
parameterized
.
expand
(
params
,
name_func
=
parameterized_custom_name_func
)
@
require_torch_multi_
gpu
@
require_torch_multi_
accelerator
def
test_basic_distributed
(
self
,
stage
,
dtype
):
self
.
run_and_check
(
stage
=
stage
,
dtype
=
dtype
,
distributed
=
True
)
...
...
@@ -927,7 +928,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
)
@
parameterized
.
expand
(
params
,
name_func
=
parameterized_custom_name_func
)
@
require_torch_multi_
gpu
@
require_torch_multi_
accelerator
def
test_fp32_distributed
(
self
,
stage
,
dtype
):
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
# therefore no quality checks, just basic completion checks are done
...
...
@@ -968,9 +969,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
self
.
do_checks
(
output_dir
,
do_train
=
do_train
,
do_eval
=
do_eval
)
@
parameterized
.
expand
([
"bf16"
,
"fp16"
,
"fp32"
])
@
require_torch_multi_
gpu
@
require_torch_multi_
accelerator
def
test_inference
(
self
,
dtype
):
if
dtype
==
"bf16"
and
not
is_torch_bf16_
gpu_
available
(
):
if
dtype
==
"bf16"
and
not
is_torch_bf16_available
_on_device
(
torch_device
):
self
.
skipTest
(
"test requires bfloat16 hardware support"
)
# this is just inference, so no optimizer should be loaded
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment