"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "269078a7ebad5b36dfb2205d310f56c950b02ec3"
Unverified commit 44eb8bde, authored by Patrick von Platen, committed by GitHub

map only on one process (#13810)

parent 9a9805fc
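
Note: the pattern this commit applies across the no_trainer example scripts is sketched below. This is a minimal, illustrative sketch; the dataset name, tokenizer checkpoint, and column name are placeholders and are not taken from the diff. The main process runs the `map` call first and writes the `datasets` cache; the other processes wait at the context manager's barrier and then execute the same call, which is served from that cache instead of being recomputed on every rank.

# Minimal sketch of the pattern applied throughout this diff. It assumes the
# `accelerate`, `datasets`, and `transformers` packages are installed; the
# dataset and tokenizer names below are illustrative placeholders.
from accelerate import Accelerator
from datasets import load_dataset
from transformers import AutoTokenizer

accelerator = Accelerator()
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def tokenize_function(examples):
    return tokenizer(examples["text"])


# The main process enters the block immediately and materializes the cache;
# every other process waits at the barrier, then runs the same `map` call,
# which is answered from the cache instead of re-tokenizing the data.
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        desc="Running tokenizer on dataset",
    )

Under a multi-process launch (for example via accelerate launch), this avoids every rank tokenizing the full corpus independently.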
@@ -337,14 +337,15 @@ def main():
     def tokenize_function(examples):
         return tokenizer(examples[text_column_name])
 
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     if args.block_size is None:
         block_size = tokenizer.model_max_length
@@ -386,13 +387,14 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {block_size}",
-    )
+    with accelerator.main_process_first():
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {block_size}",
+        )
 
     train_dataset = lm_datasets["train"]
     eval_dataset = lm_datasets["validation"]
...
@@ -374,14 +374,15 @@ def main():
                 return_special_tokens_mask=True,
             )
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
     else:
         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
@@ -389,14 +390,15 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
@@ -422,13 +424,14 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-    tokenized_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {max_seq_length}",
-    )
+    with accelerator.main_process_first():
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {max_seq_length}",
+        )
 
     train_dataset = tokenized_datasets["train"]
     eval_dataset = tokenized_datasets["validation"]
...
@@ -381,9 +381,10 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
...
@@ -440,14 +440,15 @@ def main():
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
     if args.max_train_samples is not None:
         # Number of samples might increase during Feature Creation, We select only specified max samples
         train_dataset = train_dataset.select(range(args.max_train_samples))
@@ -530,14 +531,15 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
 
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
@@ -551,17 +553,18 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
     if args.max_predict_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
         predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
...
@@ -468,18 +468,20 @@ def main():
     if args.max_train_samples is not None:
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
+
     if args.max_train_samples is not None:
         # Number of samples might increase during Feature Creation, We select only specified max samples
         train_dataset = train_dataset.select(range(args.max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -535,14 +537,15 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
 
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
@@ -556,17 +559,18 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
     if args.max_predict_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
         predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
...
@@ -439,13 +439,14 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
...
@@ -330,12 +330,13 @@ def main():
         result["labels"] = examples["label"]
         return result
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
...
@@ -403,12 +403,13 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_raw_datasets = raw_datasets.map(
-        tokenize_and_align_labels,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_raw_datasets = raw_datasets.map(
+            tokenize_and_align_labels,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_raw_datasets["train"]
     eval_dataset = processed_raw_datasets["validation"]
...
@@ -418,14 +418,15 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
...
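
As a rough sketch of why this serializes preprocessing, the context manager behaves like a barrier that the main process only reaches after running the body. This approximates the behavior of Accelerator.main_process_first() for illustration; it is not the library's actual implementation.

from contextlib import contextmanager


@contextmanager
def main_process_first_sketch(accelerator):
    # Illustrative approximation of Accelerator.main_process_first().
    # Non-main processes block at the barrier before the body runs;
    # the main process enters right away.
    if not accelerator.is_main_process:
        accelerator.wait_for_everyone()
    yield
    # The main process reaches the barrier only after the body has run,
    # releasing the other processes; they then execute the same body and
    # read the datasets cache written by the main process.
    if accelerator.is_main_process:
        accelerator.wait_for_everyone()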