Unverified commit df9d6eff authored by Thomas Wolf, committed by GitHub

Merge pull request #1081 from huggingface/fix_distributed_barrier_hang

Fix distributed barrier hang
parents 3f20dd71 57272d5d
@@ -251,7 +251,7 @@ def evaluate(args, model, tokenizer, prefix=""):

 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0]:
+    if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     processor = processors[task]()
@@ -286,7 +286,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)

-    if args.local_rank == 0:
+    if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Convert to Tensors and build dataset
...
@@ -272,7 +272,7 @@ def evaluate(args, model, tokenizer, prefix=""):

 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0]:
+    if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Load data features from cache or dataset file
@@ -299,7 +299,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)

-    if args.local_rank == 0:
+    if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Convert to Tensors and build dataset
...
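Why the guard fixes the hang: in distributed training the two barriers form a handshake, with every non-main rank parking at the first `torch.distributed.barrier()` while rank 0 builds and saves the feature cache, and rank 0's own barrier call releasing them afterwards. Evaluation, however, runs on the main process only, so the other ranks never reach their side of the handshake and rank 0 would block forever at the second barrier. The added `and not evaluate` condition skips both barriers in that case. Below is a minimal sketch of the pattern, assuming the usual convention that `local_rank == -1` means non-distributed; `build_features` is a hypothetical placeholder, not code from this repository.

```python
import torch


def build_features(args):
    # Hypothetical placeholder for the real feature-building and
    # cache-saving logic patched in the diff above.
    return []


def load_and_cache_examples(args, evaluate=False):
    # Training: non-main ranks wait here so that only rank 0 builds the
    # cache. Evaluation runs on the main process alone, so the barrier
    # must be skipped or rank 0 will wait for peers that never arrive.
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()

    features = build_features(args)

    # Rank 0 releases the waiting ranks once the cache exists; again
    # skipped during evaluation to avoid a one-sided barrier.
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()

    return features
```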