Unverified Commit 5764e67c authored by Zachary Mueller, committed by GitHub

Revert DeepSpeed stuff from accelerate integration (#22899)

parent f1430377
@@ -1544,24 +1544,39 @@ class TrainingArguments:
             self._n_gpu = 1
             torch.cuda.set_device(device)
         elif self.deepspeed:
-            self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
+            # deepspeed inits torch.distributed internally
+            from .deepspeed import is_deepspeed_available
+
+            if not is_deepspeed_available():
+                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
+            import deepspeed
+
+            deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
+
+            # workaround for setups like notebooks where the launcher can't be used,
+            # but deepspeed requires a dist env.
+            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
+            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
+            device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
         else:
             self.distributed_state = PartialState(backend=self.xpu_backend)
             self._n_gpu = 1
-        if not is_sagemaker_mp_enabled():
+        if not is_sagemaker_mp_enabled() and not self.deepspeed:
             device = self.distributed_state.device
             self.local_rank = self.distributed_state.local_process_index
         if (
             torch.distributed.is_available()
             and torch.distributed.is_initialized()
+            and hasattr(self, "distributed_state")
             and self.distributed_state.distributed_type == DistributedType.NO
         ):
             logger.warning(
                 "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
                 "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
             )
-        if is_torch_tpu_available():
-            device = self.distributed_state.device
-            self._n_gpu = 0
+        if not self.deepspeed:
+            if is_torch_tpu_available():
+                device = self.distributed_state.device
+                self._n_gpu = 0
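
As a reading aid, here is a minimal, self-contained sketch of the device-setup path this hunk restores. It is not the transformers source: setup_deepspeed_device and its ddp_timeout parameter are hypothetical names, and the explicit LOCAL_RANK guard is an addition (the reverted code assumes the dist env is already in place). It assumes deepspeed is installed.

import os
from datetime import timedelta

import torch


def setup_deepspeed_device(ddp_timeout: int = 1800) -> torch.device:
    import deepspeed  # assumed installed: pip install deepspeed

    # deepspeed inits torch.distributed internally, honoring the same
    # timeout the Trainer passes via --ddp_timeout
    deepspeed.init_distributed(timeout=timedelta(seconds=ddp_timeout))

    # LOCAL_RANK comes from the launcher, from the user (e.g. in a notebook),
    # or from init_distributed itself when mpi4py is installed
    local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
    if local_rank < 0:
        # guard added in this sketch so it fails loudly instead of
        # constructing torch.device("cuda", -1)
        raise RuntimeError("LOCAL_RANK is unset; use a launcher or set it manually")
    return torch.device("cuda", local_rank)

Run it under a launcher that populates LOCAL_RANK, e.g. `deepspeed script.py` or `torchrun --nproc_per_node=2 script.py`.
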
@@ -1649,7 +1664,7 @@ class TrainingArguments:
             return ParallelMode.SAGEMAKER_MODEL_PARALLEL
         elif is_sagemaker_dp_enabled():
             return ParallelMode.SAGEMAKER_DATA_PARALLEL
-        elif hasattr(self, "distributed_state") and (self.distributed_state.distributed_type != DistributedType.NO):
+        elif self.deepspeed or self.distributed_state.distributed_type != DistributedType.NO:
             return ParallelMode.DISTRIBUTED
         elif self.n_gpu > 1:
             return ParallelMode.NOT_DISTRIBUTED
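
The second hunk exists because, after the revert, the deepspeed path never creates self.distributed_state, yet DeepSpeed runs must still report ParallelMode.DISTRIBUTED. Below is a hypothetical standalone mirror of that precedence with the SageMaker branches omitted; the function and its boolean arguments are illustrative, not the transformers API, though the ParallelMode names are real.

from enum import Enum


class ParallelMode(Enum):
    DISTRIBUTED = "distributed"
    NOT_DISTRIBUTED = "not_distributed"
    NOT_PARALLEL = "not_parallel"


def parallel_mode(deepspeed: bool, is_distributed: bool, n_gpu: int) -> ParallelMode:
    # `is_distributed` stands in for
    # `self.distributed_state.distributed_type != DistributedType.NO`.
    # Checking `deepspeed` first means distributed_state is never consulted
    # on the deepspeed path, where it no longer exists after the revert.
    if deepspeed or is_distributed:
        return ParallelMode.DISTRIBUTED
    if n_gpu > 1:
        return ParallelMode.NOT_DISTRIBUTED
    return ParallelMode.NOT_PARALLEL

For example, parallel_mode(deepspeed=True, is_distributed=False, n_gpu=1) yields ParallelMode.DISTRIBUTED, which is the behavior the one-line change restores.
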