Unverified commit 8c3b1fcb, authored by Stas Bekman and committed by GitHub

[trainer] a few fixes (#9993)

* trainer fixes

* don't switch the model to the device when using deepspeed or mp

* correct the fix
parent 714855bd
@@ -264,11 +264,14 @@ class Trainer:
         self.eval_dataset = eval_dataset
         self.tokenizer = tokenizer
 
-        # Model parallel
-        if not self.is_model_parallel:
+        # postpone switching model to cuda when:
+        # 1. MP - since we are trying to fit a much bigger than 1 gpu model
+        # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway
+        if not (self.is_model_parallel or args.deepspeed):
             model = model.to(args.device)
-        else:
-            # Force n_gpu to 1 to avoid DataParallel.
+
+        # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
+        if self.is_model_parallel:
             self.args._n_gpu = 1
 
         # later use `self.model is self.model_wrapped` to check if it's wrapped or not
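
The first hunk defers the `.to(args.device)` call whenever model parallelism or DeepSpeed is active. Below is a minimal sketch of that decision, using illustrative names (`place_model`, `use_deepspeed`) rather than the real Trainer attributes and arguments:

```python
import torch


def place_model(model: torch.nn.Module, device: torch.device,
                is_model_parallel: bool, use_deepspeed: bool) -> torch.nn.Module:
    """Illustrative stand-in for the Trainer's device-placement step (not the actual code)."""
    # Postpone moving the model to the target device when:
    # 1. model parallelism is used - the model is bigger than a single GPU
    # 2. DeepSpeed is used - it handles loading/placement of the model itself
    if not (is_model_parallel or use_deepspeed):
        model = model.to(device)
    return model
```

As the new comment in the diff notes, fp16-enabled DeepSpeed loads the model at half size and does not need the `.to()` call at all.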
@@ -790,6 +793,8 @@ class Trainer:
             model = ShardedDDP(model, self.optimizer)
         elif is_sagemaker_distributed_available():
             model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
+        if self.deepspeed:
+            pass  # already initialized its own DDP earlier
         elif self.args.local_rank != -1:
             if self.args.ddp_find_unused_parameters is not None:
                 find_unused_parameters = self.args.ddp_find_unused_parameters
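
The second hunk makes the trainer skip its own DDP wrapping when DeepSpeed is in charge, since DeepSpeed has already set up its own distributed wrapper. A minimal sketch of that guard, using a hypothetical `wrap_model` helper rather than the real Trainer method:

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_model(model: torch.nn.Module, deepspeed_engine=None, local_rank: int = -1):
    """Illustrative stand-in for the trainer's distributed-wrapping step (not the actual code)."""
    if deepspeed_engine is not None:
        # DeepSpeed already initialized its own DDP earlier; don't wrap again.
        return model
    if local_rank != -1:
        # Plain torch.distributed case: wrap in DDP (assumes the process group
        # was already initialized elsewhere).
        model = DDP(model, device_ids=[local_rank])
    return model
```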