"examples/vscode:/vscode.git/clone" did not exist on "8581fbaa6d420c8559a7b52397f466d362265aba"
Unverified Commit 04dbea31 authored by Bhadresh Savani, committed by GitHub

[Examples] Added context manager to datasets map (#12367)

* added context manager to datasets map

* fixed style and spaces

* fixed deprecation warning

* changed desc
parent d25ad34c
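
Every hunk below applies the same pattern: the `datasets.map` call in each example script is wrapped in the `main_process_first` context manager exposed by `TrainingArguments`. A minimal, self-contained sketch of that pattern (the dataset, tokenizer, and output directory here are placeholders for illustration, not taken from this commit):

```python
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments

# Placeholder dataset/tokenizer/output_dir, chosen for illustration only.
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
training_args = TrainingArguments(output_dir="/tmp/example")


def tokenize_function(examples):
    return tokenizer(examples["text"])


# In distributed training, the main process enters the block first and
# writes the datasets cache; the other ranks wait at a barrier, then run
# the same block and read the tokenized result back from the cache
# instead of recomputing it. In a single process this is a no-op wrapper.
with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
```

This avoids every rank tokenizing the corpus independently, which is why the change touches the pre-processing step of each example script.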
@@ -356,6 +356,7 @@ def main():
             )
         return output
+    with training_args.main_process_first(desc="dataset map tokenization"):
         tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
@@ -404,6 +405,7 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    with training_args.main_process_first(desc="grouping texts together"):
         lm_datasets = tokenized_datasets.map(
             group_texts,
             batched=True,
...
@@ -383,6 +383,7 @@ def main():
                 return_special_tokens_mask=True,
             )
+        with training_args.main_process_first(desc="dataset map tokenization"):
             tokenized_datasets = raw_datasets.map(
                 tokenize_function,
                 batched=True,
@@ -398,6 +399,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        with training_args.main_process_first(desc="dataset map tokenization"):
             tokenized_datasets = raw_datasets.map(
                 tokenize_function,
                 batched=True,
@@ -430,6 +432,7 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with training_args.main_process_first(desc="grouping texts together"):
             tokenized_datasets = tokenized_datasets.map(
                 group_texts,
                 batched=True,
...
@@ -359,6 +359,7 @@ def main():
             examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
             return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
+        with training_args.main_process_first(desc="dataset map tokenization"):
             tokenized_datasets = raw_datasets.map(
                 tokenize_function,
                 batched=True,
@@ -372,6 +373,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name])
+        with training_args.main_process_first(desc="dataset map tokenization"):
             tokenized_datasets = raw_datasets.map(
                 tokenize_function,
                 batched=True,
@@ -404,6 +406,7 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with training_args.main_process_first(desc="grouping texts together"):
             tokenized_datasets = tokenized_datasets.map(
                 group_texts,
                 batched=True,
...
@@ -353,6 +353,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
                 batched=True,
@@ -366,6 +367,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
                 batched=True,
...
@@ -418,6 +418,7 @@ def main():
             # We will select sample from whole data if argument is specified
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # Create train feature from dataset
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 prepare_train_features,
                 batched=True,
@@ -480,6 +481,7 @@ def main():
             # We will select sample from whole data
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Validation Feature Creation
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
                 prepare_validation_features,
                 batched=True,
@@ -500,6 +502,7 @@ def main():
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Predict Feature Creation
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_examples.map(
                 prepare_validation_features,
                 batched=True,
...
@@ -429,6 +429,7 @@ def main():
             # Select samples from Dataset, this will help to decrease processing time
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # Create Training Features
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 prepare_train_features,
                 batched=True,
@@ -515,6 +516,7 @@ def main():
             # Selecting Eval Samples from Dataset
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Create Features from Eval Dataset
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
                 prepare_validation_features,
                 batched=True,
@@ -535,6 +537,7 @@ def main():
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Test Feature Creation
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_examples.map(
                 prepare_validation_features,
                 batched=True,
...
@@ -435,6 +435,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
                 batched=True,
@@ -451,6 +452,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
                 batched=True,
@@ -467,6 +469,7 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
                 batched=True,
...
@@ -400,6 +400,7 @@ def main():
         result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
         return result
+    with training_args.main_process_first(desc="dataset map pre-processing"):
         raw_datasets = raw_datasets.map(
             preprocess_function,
             batched=True,
@@ -526,7 +527,7 @@ def main():
     for predict_dataset, task in zip(predict_datasets, tasks):
         # Removing the `label` columns because it contains -1 and Trainer won't like that.
-        predict_dataset.remove_columns_("label")
+        predict_dataset = predict_dataset.remove_columns("label")
         predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
         predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
...
@@ -280,6 +280,7 @@ def main():
     if training_args.do_train:
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
                 batched=True,
@@ -293,6 +294,7 @@ def main():
     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
                 batched=True,
@@ -303,6 +305,7 @@ def main():
     if training_args.do_predict:
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
                 batched=True,
...
@@ -390,6 +390,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 tokenize_and_align_labels,
                 batched=True,
@@ -404,6 +405,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 tokenize_and_align_labels,
                 batched=True,
@@ -418,6 +420,7 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 tokenize_and_align_labels,
                 batched=True,
...
@@ -370,6 +370,7 @@ def main():
             # Select Sample from Dataset
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # tokenize train dataset in batch
+        with training_args.main_process_first(desc="train dataset map tokenization"):
             train_dataset = train_dataset.map(
                 tokenize_function,
                 batched=True,
@@ -386,6 +387,7 @@ def main():
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         # tokenize validation dataset
+        with training_args.main_process_first(desc="validation dataset map tokenization"):
             eval_dataset = eval_dataset.map(
                 tokenize_function,
                 batched=True,
@@ -402,6 +404,7 @@ def main():
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         # tokenize predict dataset
+        with training_args.main_process_first(desc="prediction dataset map tokenization"):
             predict_dataset = predict_dataset.map(
                 tokenize_function,
                 batched=True,
...
@@ -503,7 +503,7 @@ def main():
     for test_dataset, task in zip(test_datasets, tasks):
         # Removing the `label` columns because it contains -1 and Trainer won't like that.
-        test_dataset.remove_columns_("label")
+        test_dataset = test_dataset.remove_columns("label")
         predictions = trainer.predict(test_dataset=test_dataset).predictions
         predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
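
The two `remove_columns` hunks are the "fixed deprecation warning" bullet from the commit message: the in-place `Dataset.remove_columns_` was deprecated in `datasets` in favor of the functional `remove_columns`, which returns a new dataset and therefore must be reassigned. A toy illustration (the column names here are made up):

```python
from datasets import Dataset

ds = Dataset.from_dict({"sentence": ["a", "b"], "label": [-1, -1]})

# `remove_columns` does not mutate `ds`; it returns a copy without the
# column, so the result is assigned back, as the hunks above now do.
ds = ds.remove_columns("label")
print(ds.column_names)  # ['sentence']
```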