"docs/vscode:/vscode.git/clone" did not exist on "8992589dd6fcfdfc5f1435a4a9a92da501dd5ab6"
Unverified commit 04dbea31 authored by Bhadresh Savani, committed by GitHub

[Examples] Added context manager to datasets map (#12367)

* added context manager to datasets map

* fixed style and spaces

* fixed deprecation warning

* changed desc
parent d25ad34c
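
What the change does, in brief: every `datasets.map` preprocessing call in the example scripts is now wrapped in the `training_args.main_process_first(desc=...)` context manager, so that in distributed runs the main process performs the (cached) preprocessing first and the remaining ranks then load the resulting cache instead of redoing the work. A minimal sketch of the pattern, assuming an illustrative tokenizer, dataset, and output directory (none of these names come from the diff itself):

from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments

training_args = TrainingArguments(output_dir="out")  # illustrative args
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")  # illustrative data


def tokenize_function(examples):
    return tokenizer(examples["text"])


# Main process runs the map first; other ranks wait, then reuse the cache.
with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
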
@@ -356,6 +356,7 @@ def main():
             )
         return output
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
@@ -404,6 +405,7 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    with training_args.main_process_first(desc="grouping texts together"):
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
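
The comment in the hunk above points at `Dataset.map`'s multiprocessing support. For context, a hedged sketch of the "grouping texts together" step that these `map` calls run: concatenate the tokenized batch and chop it into fixed-size blocks. The `block_size` value and `num_proc=4` are illustrative; the real scripts derive them from the tokenizer and the data arguments.

block_size = 1024  # illustrative; the scripts compute this from the tokenizer


def group_texts(examples):
    # Concatenate every column in the batch, then split into block_size chunks.
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


# num_proc fans the work out over worker processes, as the comment describes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)
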
@@ -383,6 +383,7 @@ def main():
                 return_special_tokens_mask=True,
             )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
@@ -398,6 +399,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
@@ -430,6 +432,7 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
@@ -359,6 +359,7 @@ def main():
             examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
             return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
@@ -372,6 +373,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name])
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
@@ -404,6 +406,7 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
@@ -353,6 +353,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -366,6 +367,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -418,6 +418,7 @@ def main():
             # We will select samples from the whole data if the argument is specified
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # Create train features from the dataset
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
@@ -480,6 +481,7 @@ def main():
             # We will select samples from the whole data
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Validation Feature Creation
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
@@ -500,6 +502,7 @@ def main():
             # We will select samples from the whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Predict Feature Creation
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
@@ -429,6 +429,7 @@ def main():
             # Select samples from the dataset; this will help to decrease processing time
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # Create Training Features
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
@@ -515,6 +516,7 @@ def main():
             # Selecting Eval Samples from Dataset
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Create Features from Eval Dataset
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
@@ -535,6 +537,7 @@ def main():
             # We will select samples from the whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Test Feature Creation
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
@@ -435,6 +435,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -451,6 +452,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -467,6 +469,7 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -400,6 +400,7 @@ def main():
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
         return result
+    with training_args.main_process_first(desc="dataset map pre-processing"):
+        raw_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
@@ -526,7 +527,7 @@ def main():
         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` column because it contains -1 and Trainer won't like that.
-            predict_dataset.remove_columns_("label")
+            predict_dataset = predict_dataset.remove_columns("label")
             predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
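
The "fixed deprecation warning" bullet in the commit message corresponds to the one-line change above: the in-place `Dataset.remove_columns_` method was deprecated in `datasets` in favor of `remove_columns`, which returns a new dataset instead of mutating in place, so the result has to be rebound:

# Before (in-place, emits a deprecation warning on newer `datasets`):
# predict_dataset.remove_columns_("label")

# After: remove_columns returns a new Dataset, so reassign the name.
predict_dataset = predict_dataset.remove_columns("label")
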
@@ -280,6 +280,7 @@ def main():
     if training_args.do_train:
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -293,6 +294,7 @@ def main():
     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -303,6 +305,7 @@ def main():
     if training_args.do_predict:
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
@@ -390,6 +390,7 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
@@ -404,6 +405,7 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
@@ -418,6 +420,7 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
@@ -370,6 +370,7 @@ def main():
             # Select samples from the dataset
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         # tokenize train dataset in batches
+        with training_args.main_process_first(desc="train dataset map tokenization"):
+            train_dataset = train_dataset.map(
+                tokenize_function,
+                batched=True,
@@ -386,6 +387,7 @@ def main():
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         # tokenize validation dataset
+        with training_args.main_process_first(desc="validation dataset map tokenization"):
+            eval_dataset = eval_dataset.map(
+                tokenize_function,
+                batched=True,
@@ -402,6 +404,7 @@ def main():
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         # tokenize predict dataset
+        with training_args.main_process_first(desc="prediction dataset map tokenization"):
+            predict_dataset = predict_dataset.map(
+                tokenize_function,
+                batched=True,
@@ -503,7 +503,7 @@ def main():
         for test_dataset, task in zip(test_datasets, tasks):
             # Removing the `label` column because it contains -1 and Trainer won't like that.
-            test_dataset.remove_columns_("label")
+            test_dataset = test_dataset.remove_columns("label")
             predictions = trainer.predict(test_dataset=test_dataset).predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
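
For readers curious how `main_process_first` can work under the hood, here is a hedged sketch (not the actual transformers source) of such a context manager built on `torch.distributed`: non-main ranks block on a barrier until the main process has finished the wrapped block, after which they run it themselves and hit the `datasets` cache. The `is_main_process` parameter is an assumption for illustration; the real `TrainingArguments` method also handles TPU backends and local versus global main-process distinctions.

from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def main_process_first(is_main_process: bool):
    # Sketch only, assuming a torch.distributed process group.
    if dist.is_available() and dist.is_initialized():
        try:
            if not is_main_process:
                dist.barrier()  # wait here until the main process is done
            yield
        finally:
            if is_main_process:
                dist.barrier()  # release the waiting ranks
    else:
        yield  # single-process run: behaves as a no-op
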