Unverified commit f5ed19f5, authored by Patrick von Platen, committed by GitHub
Browse files

[Speech Recognition] - Distributed training: Make sure vocab file removal and...

[Speech Recognition] - Distributed training: Make sure vocab file removal and creation don't interfere (#14161)

* up

* better
parent 840fc8db
...@@ -395,20 +395,20 @@ def main(): ...@@ -395,20 +395,20 @@ def main():
# the training and evaluation datasets # the training and evaluation datasets
# We need to make sure that only first rank saves vocabulary # We need to make sure that only first rank saves vocabulary
# make sure all processes wait until vocab is created # make sure all processes wait until vocab is created
with training_args.main_process_first(desc="dataset map vocabulary creation"):
vocab_dict = create_vocabulary_from_data(raw_datasets)
vocab_file = os.path.join(training_args.output_dir, "vocab.json") vocab_file = os.path.join(training_args.output_dir, "vocab.json")
# save vocab dict to be loaded into tokenizer with training_args.main_process_first():
os.makedirs(training_args.output_dir, exist_ok=True)
if training_args.overwrite_output_dir and os.path.isfile(vocab_file): if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
os.remove(vocab_file) os.remove(vocab_file)
with training_args.main_process_first(desc="dataset map vocabulary creation"):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
with open(vocab_file, "w") as vocab_file: os.makedirs(training_args.output_dir, exist_ok=True)
json.dump(vocab_dict, vocab_file) vocab_dict = create_vocabulary_from_data(raw_datasets)
# save vocab dict to be loaded into tokenizer
with open(vocab_file, "w") as file:
json.dump(vocab_dict, file)
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
# Note for distributed training, the .from_pretrained methods guarantee that only # Note for distributed training, the .from_pretrained methods guarantee that only
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment