Commit d82e5dee authored by thomwolf

set find_unused_parameters=True in DDP

parent a59abedf
@@ -1468,12 +1468,13 @@ python -m torch.distributed.launch --nproc_per_node=8 \
   --do_lower_case \
   --train_file $SQUAD_DIR/train-v1.1.json \
   --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --train_batch_size 12 \
   --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
+  --num_train_epochs 2 \
   --max_seq_length 384 \
   --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
+  --output_dir /tmp/debug_squad/ \
+  --train_batch_size 24 \
+  --gradient_accumulation_steps 2
 ```
 
 ## Notebooks
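As context for the README change above: gradient accumulation sums gradients over several small forward/backward passes and applies a single optimizer step at the end, so a large effective batch fits in the memory budget of a small one. Assuming the example script divides `--train_batch_size` by `--gradient_accumulation_steps` (as these examples have historically done), each forward pass still sees 12 examples while the optimizer steps on an effective batch of 24. A minimal sketch of the pattern, with a toy model and random data standing in for BERT and SQuAD:

```python
import torch
from torch import nn

# Purely illustrative stand-ins: the real script builds
# BertForQuestionAnswering and a SQuAD DataLoader instead.
model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=3e-5)
batches = [(torch.randn(12, 10), torch.randn(12, 1)) for _ in range(4)]

accumulation_steps = 2  # mirrors --gradient_accumulation_steps 2 above
loss_fn = nn.MSELoss()

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(batches):
    loss = loss_fn(model(inputs), targets)
    # Scale each micro-batch loss so the accumulated gradient equals the
    # gradient of the mean loss over the full effective batch of 24.
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()       # one update per two micro-batches
        optimizer.zero_grad()
```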
@@ -907,7 +907,10 @@ def main():
         # try:
         #     from apex.parallel import DistributedDataParallel as DDP
         # except ImportError:
         #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
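For reference on the main change: by default, DDP's reducer waits during the backward pass for a gradient from every registered parameter, and it errors out (or hangs) when some parameters never contribute to the loss. `find_unused_parameters=True` makes DDP walk the autograd graph after each forward pass and mark unreachable parameters as ready instead of waiting. In this repo the flag is likely needed because models such as `BertForQuestionAnswering` compute outputs (the pooler's, for example) that never feed the loss, so those weights receive no gradients. A minimal, self-contained sketch with a single-process `gloo` group and a hypothetical toy model, not the repo's code:

```python
import os
import torch
import torch.distributed as dist
from torch import nn

# Single-process group just to make the example runnable on CPU; a real
# launch would use torch.distributed.launch with one process per GPU.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

class TwoHeads(nn.Module):
    """Toy model whose forward pass uses only one of its two heads."""
    def __init__(self):
        super().__init__()
        self.shared = nn.Linear(8, 8)
        self.head_a = nn.Linear(8, 1)
        self.head_b = nn.Linear(8, 1)  # never used below -> no gradient

    def forward(self, x):
        return self.head_a(self.shared(x))

# Without find_unused_parameters=True, DDP would wait for head_b's
# gradients, which never arrive; with it, they are marked as ready.
model = nn.parallel.DistributedDataParallel(TwoHeads(),
                                            find_unused_parameters=True)
loss = model(torch.randn(4, 8)).sum()
loss.backward()
dist.destroy_process_group()
```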