Merge pull request #896 from zijunsun/master

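Fix multi-GPU training bug when using fp16: wrap the model in `torch.nn.DataParallel` inside `train()`, after apex `amp.initialize`, instead of in `main()`.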

Merge pull request #896 from zijunsun/master
fix multi-gpu training bug when using fp16
c054b5ee · Thomas Wolf · GitHub · 46cc9dd2 · f0aeb7a8 · c054b5ee
Unverified commit c054b5ee, authored Jul 26, 2019 by Thomas Wolf; committed by GitHub on Jul 26, 2019.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 4 deletions

examples/run_glue.py (+4 -2)

examples/run_squad.py (+4 -2)

No files found.
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer):
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -418,8 +422,6 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)


--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer):
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -460,8 +464,6 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)