chenpangpang / transformers · Commits · 5764e67c
"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b57bfb5fa06cc45bdb9af2f7fd2ab11b0227f41a"
Unverified commit 5764e67c, authored Apr 20, 2023 by Zachary Mueller; committed by GitHub on Apr 20, 2023.
Revert DeepSpeed stuff from accelerate integration (#22899)
Parent: f1430377
Showing 1 changed file with 62 additions and 47 deletions.
src/transformers/training_args.py (+62, -47)
...
@@ -1544,69 +1544,84 @@ class TrainingArguments:
             self._n_gpu = 1
             torch.cuda.set_device(device)
         elif self.deepspeed:
-            self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
+            # deepspeed inits torch.distributed internally
+            from .deepspeed import is_deepspeed_available
+
+            if not is_deepspeed_available():
+                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
+            import deepspeed
+
+            deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
+
+            # workaround for setups like notebooks where the launcher can't be used,
+            # but deepspeed requires a dist env.
+            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
+            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
+
+            device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
         else:
             self.distributed_state = PartialState(backend=self.xpu_backend)
             self._n_gpu = 1
-        if not is_sagemaker_mp_enabled():
+        if not is_sagemaker_mp_enabled() and not self.deepspeed:
             device = self.distributed_state.device
             self.local_rank = self.distributed_state.local_process_index
         if (
             torch.distributed.is_available()
             and torch.distributed.is_initialized()
+            and hasattr(self, "distributed_state")
             and self.distributed_state.distributed_type == DistributedType.NO
         ):
             logger.warning(
                 "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
                 "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
             )
+        if not self.deepspeed:
             if is_torch_tpu_available():
                 device = self.distributed_state.device
                 self._n_gpu = 0
             elif is_sagemaker_dp_enabled():
                 self._n_gpu = 1
             elif self.distributed_state.distributed_type == DistributedType.NO:
                 if self.use_mps_device:
                     if not torch.backends.mps.is_available():
                         if not torch.backends.mps.is_built():
                             raise AssertionError(
                                 "MPS not available because the current PyTorch install was not "
                                 "built with MPS enabled. Please install torch version >=1.12.0 on "
                                 "your Apple silicon Mac running macOS 12.3 or later with a native "
                                 "version (arm64) of Python"
                             )
                         else:
                             raise AssertionError(
                                 "MPS not available because the current MacOS version is not 12.3+ "
                                 "and/or you do not have an MPS-enabled device on this machine."
                             )
                     else:
                         if not version.parse(version.parse(torch.__version__).base_version) > version.parse("1.12.0"):
                             warnings.warn(
                                 "We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing)"
                                 " on your MacOS machine. It has major fixes related to model correctness and performance"
                                 " improvements for transformer based models. Please refer to"
                                 " https://github.com/pytorch/pytorch/issues/82707 for more details."
                             )
                         device = torch.device("mps")
                         self._n_gpu = 1
                 else:
                     # if n_gpu is > 1 we'll use nn.DataParallel.
                     # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
                     # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
                     # trigger an error that a device index is missing. Index 0 takes into account the
                     # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
                     # will use the first GPU in that env, i.e. GPU#1
                     # device = self.distributed_state.device
                     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                     # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
                     # the default value.
                     self._n_gpu = torch.cuda.device_count()
                     if device.type == "cuda":
                         torch.cuda.set_device(device)
         return device

     @property
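Note on the hunk above (commentary, not part of the commit): the restored `elif self.deepspeed:` branch hands process-group setup to `deepspeed.init_distributed()` and then recovers the device from the `LOCAL_RANK` environment variable, which covers notebook-style runs where no launcher exports it. A minimal, self-contained sketch of that fallback pattern (variable names here are illustrative, not from the diff):

import os

import torch

# LOCAL_RANK is normally exported by a launcher (e.g. torchrun), set manually
# by the user, or filled in by init_distributed when mpi4py is installed;
# "-1" signals that nothing set it.
local_rank = int(os.environ.get("LOCAL_RANK", "-1"))

if local_rank >= 0 and torch.cuda.is_available():
    # pin this process to the GPU matching its local rank
    device = torch.device("cuda", local_rank)
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)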
...
@@ -1649,7 +1664,7 @@ class TrainingArguments:
             return ParallelMode.SAGEMAKER_MODEL_PARALLEL
         elif is_sagemaker_dp_enabled():
             return ParallelMode.SAGEMAKER_DATA_PARALLEL
-        elif hasattr(self, "distributed_state") and (self.distributed_state.distributed_type != DistributedType.NO):
+        elif self.deepspeed or self.distributed_state.distributed_type != DistributedType.NO:
             return ParallelMode.DISTRIBUTED
         elif self.n_gpu > 1:
             return ParallelMode.NOT_DISTRIBUTED
...
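Note on the second hunk (commentary, not part of the commit): the new condition relies on `or` short-circuiting, so when `--deepspeed` is set, `parallel_mode` reports `ParallelMode.DISTRIBUTED` without ever reading `self.distributed_state`, which the reverted `_setup_devices` no longer creates on the deepspeed path. A toy sketch (hypothetical `Args` and `is_distributed` names) of why the short-circuit avoids an `AttributeError`:

class Args:
    # deepspeed path: the config value is truthy, and no distributed_state
    # attribute ever gets set on the object
    deepspeed = "ds_config.json"

def is_distributed(args):
    # with `or`, the right-hand operand is skipped when the left is truthy
    return bool(args.deepspeed) or args.distributed_state.distributed_type != "NO"

print(is_distributed(Args()))  # True; args.distributed_state is never touched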