Unverified commit df9d6eff authored by Thomas Wolf, committed by GitHub

Merge pull request #1081 from huggingface/fix_distributed_barrier_hang

Fix distributed barrier hang
parents 3f20dd71 57272d5d
@@ -251,7 +251,7 @@ def evaluate(args, model, tokenizer, prefix=""):

 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0]:
+    if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     processor = processors[task]()
@@ -286,7 +286,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)

-    if args.local_rank == 0:
+    if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Convert to Tensors and build dataset
...
@@ -272,7 +272,7 @@ def evaluate(args, model, tokenizer, prefix=""):

 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0]:
+    if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Load data features from cache or dataset file
@@ -299,7 +299,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)

-    if args.local_rank == 0:
+    if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Convert to Tensors and build dataset
...
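Why the guard fixes the hang: in distributed training the two barriers form a handshake, with every non-main rank parking at the first `torch.distributed.barrier()` while rank 0 builds and saves the feature cache, and rank 0's own barrier call releasing them afterwards. Evaluation, however, runs on the main process only, so the other ranks never reach their side of the handshake and rank 0 would block forever at the second barrier. The added `and not evaluate` condition skips both barriers in that case. Below is a minimal sketch of the pattern, assuming the usual convention that `local_rank == -1` means non-distributed; `build_features` is a hypothetical placeholder, not code from this repository.

```python
import torch


def build_features(args):
    # Hypothetical placeholder for the real feature-building and
    # cache-saving logic patched in the diff above.
    return []


def load_and_cache_examples(args, evaluate=False):
    # Training: non-main ranks wait here so that only rank 0 builds the
    # cache. Evaluation runs on the main process alone, so the barrier
    # must be skipped or rank 0 will wait for peers that never arrive.
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()

    features = build_features(args)

    # Rank 0 releases the waiting ranks once the cache exists; again
    # skipped during evaluation to avoid a one-sided barrier.
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()

    return features
```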