Unverified Commit 44eb8bde authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

map only on one process (#13810)

parent 9a9805fc
...@@ -337,6 +337,7 @@ def main(): ...@@ -337,6 +337,7 @@ def main():
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples[text_column_name]) return tokenizer(examples[text_column_name])
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
...@@ -386,6 +387,7 @@ def main(): ...@@ -386,6 +387,7 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
with accelerator.main_process_first():
lm_datasets = tokenized_datasets.map( lm_datasets = tokenized_datasets.map(
group_texts, group_texts,
batched=True, batched=True,
......
...@@ -374,6 +374,7 @@ def main(): ...@@ -374,6 +374,7 @@ def main():
return_special_tokens_mask=True, return_special_tokens_mask=True,
) )
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
...@@ -389,6 +390,7 @@ def main(): ...@@ -389,6 +390,7 @@ def main():
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True) return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
...@@ -422,6 +424,7 @@ def main(): ...@@ -422,6 +424,7 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
with accelerator.main_process_first():
tokenized_datasets = tokenized_datasets.map( tokenized_datasets = tokenized_datasets.map(
group_texts, group_texts,
batched=True, batched=True,
......
...@@ -381,6 +381,7 @@ def main(): ...@@ -381,6 +381,7 @@ def main():
tokenized_inputs["labels"] = labels tokenized_inputs["labels"] = labels
return tokenized_inputs return tokenized_inputs
with accelerator.main_process_first():
processed_datasets = raw_datasets.map( processed_datasets = raw_datasets.map(
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
) )
......
...@@ -440,6 +440,7 @@ def main(): ...@@ -440,6 +440,7 @@ def main():
# We will select sample from whole data if argument is specified # We will select sample from whole data if argument is specified
train_dataset = train_dataset.select(range(args.max_train_samples)) train_dataset = train_dataset.select(range(args.max_train_samples))
# Create train feature from dataset # Create train feature from dataset
with accelerator.main_process_first():
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
prepare_train_features, prepare_train_features,
batched=True, batched=True,
...@@ -530,6 +531,7 @@ def main(): ...@@ -530,6 +531,7 @@ def main():
# We will select sample from whole data # We will select sample from whole data
eval_examples = eval_examples.select(range(args.max_eval_samples)) eval_examples = eval_examples.select(range(args.max_eval_samples))
# Validation Feature Creation # Validation Feature Creation
with accelerator.main_process_first():
eval_dataset = eval_examples.map( eval_dataset = eval_examples.map(
prepare_validation_features, prepare_validation_features,
batched=True, batched=True,
...@@ -551,6 +553,7 @@ def main(): ...@@ -551,6 +553,7 @@ def main():
# We will select sample from whole data # We will select sample from whole data
predict_examples = predict_examples.select(range(args.max_predict_samples)) predict_examples = predict_examples.select(range(args.max_predict_samples))
# Predict Feature Creation # Predict Feature Creation
with accelerator.main_process_first():
predict_dataset = predict_examples.map( predict_dataset = predict_examples.map(
prepare_validation_features, prepare_validation_features,
batched=True, batched=True,
......
...@@ -468,7 +468,9 @@ def main(): ...@@ -468,7 +468,9 @@ def main():
if args.max_train_samples is not None: if args.max_train_samples is not None:
# We will select sample from whole data if argument is specified # We will select sample from whole data if argument is specified
train_dataset = train_dataset.select(range(args.max_train_samples)) train_dataset = train_dataset.select(range(args.max_train_samples))
# Create train feature from dataset # Create train feature from dataset
with accelerator.main_process_first():
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
prepare_train_features, prepare_train_features,
batched=True, batched=True,
...@@ -535,6 +537,7 @@ def main(): ...@@ -535,6 +537,7 @@ def main():
# We will select sample from whole data # We will select sample from whole data
eval_examples = eval_examples.select(range(args.max_eval_samples)) eval_examples = eval_examples.select(range(args.max_eval_samples))
# Validation Feature Creation # Validation Feature Creation
with accelerator.main_process_first():
eval_dataset = eval_examples.map( eval_dataset = eval_examples.map(
prepare_validation_features, prepare_validation_features,
batched=True, batched=True,
...@@ -556,6 +559,7 @@ def main(): ...@@ -556,6 +559,7 @@ def main():
# We will select sample from whole data # We will select sample from whole data
predict_examples = predict_examples.select(range(args.max_predict_samples)) predict_examples = predict_examples.select(range(args.max_predict_samples))
# Predict Feature Creation # Predict Feature Creation
with accelerator.main_process_first():
predict_dataset = predict_examples.map( predict_dataset = predict_examples.map(
prepare_validation_features, prepare_validation_features,
batched=True, batched=True,
......
...@@ -439,6 +439,7 @@ def main(): ...@@ -439,6 +439,7 @@ def main():
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
return model_inputs return model_inputs
with accelerator.main_process_first():
processed_datasets = raw_datasets.map( processed_datasets = raw_datasets.map(
preprocess_function, preprocess_function,
batched=True, batched=True,
......
...@@ -330,6 +330,7 @@ def main(): ...@@ -330,6 +330,7 @@ def main():
result["labels"] = examples["label"] result["labels"] = examples["label"]
return result return result
with accelerator.main_process_first():
processed_datasets = raw_datasets.map( processed_datasets = raw_datasets.map(
preprocess_function, preprocess_function,
batched=True, batched=True,
......
...@@ -403,6 +403,7 @@ def main(): ...@@ -403,6 +403,7 @@ def main():
tokenized_inputs["labels"] = labels tokenized_inputs["labels"] = labels
return tokenized_inputs return tokenized_inputs
with accelerator.main_process_first():
processed_raw_datasets = raw_datasets.map( processed_raw_datasets = raw_datasets.map(
tokenize_and_align_labels, tokenize_and_align_labels,
batched=True, batched=True,
......
...@@ -418,6 +418,7 @@ def main(): ...@@ -418,6 +418,7 @@ def main():
model_inputs["labels"] = labels["input_ids"] model_inputs["labels"] = labels["input_ids"]
return model_inputs return model_inputs
with accelerator.main_process_first():
processed_datasets = raw_datasets.map( processed_datasets = raw_datasets.map(
preprocess_function, preprocess_function,
batched=True, batched=True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment