Unverified commit c054b5ee authored by Thomas Wolf, committed by GitHub

Merge pull request #896 from zijunsun/master

fix multi-gpu training bug when using fp16
parents 46cc9dd2 f0aeb7a8
@@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer):
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -418,8 +422,6 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
     logger.info("Training/evaluation parameters %s", args)
...
@@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer):
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -460,8 +464,6 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
     logger.info("Training/evaluation parameters %s", args)
...
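
The change moves the DataParallel wrapping out of main() and into train(), placing it after amp.initialize so that apex patches the bare model and only then is the multi-GPU wrapper applied; per apex's usage notes, amp.initialize should be called before any DataParallel or DistributedDataParallel wrapping. Below is a minimal, self-contained sketch of that ordering, not taken from the example scripts: the Linear model, Adam optimizer, use_fp16 flag, and "O1" opt_level are placeholders for illustration only.

import torch
from torch import nn

# Placeholder model/optimizer standing in for the scripts' model and args.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(128, 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# 1) apex fp16 initialization first (mirrors `if args.fp16:` in the scripts;
#    assumes NVIDIA apex is installed and a CUDA device is available).
use_fp16 = True
if use_fp16:
    from apex import amp
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# 2) multi-gpu training (should be after apex fp16 initialization), so the
#    DataParallel wrapper replicates the amp-patched model, not the other way around.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

With the previous ordering, amp.initialize received the DataParallel wrapper rather than the underlying module, which is what broke multi-GPU fp16 training in these scripts.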