"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "9d20601259b8da3d238f3bfcba4efa0a6fc34309"
Unverified commit 04dbea31, authored by Bhadresh Savani, committed by GitHub

[Examples] Added context manager to datasets map (#12367)

* added context manager to datasets map

* fixed style and spaces

* fixed warning of deprecation

* changed desc
parent d25ad34c
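The pattern applied throughout the diff below: each `datasets.map()` call in the example scripts is wrapped in the `TrainingArguments.main_process_first()` context manager, so that in distributed runs the main process performs the preprocessing and writes the `datasets` cache while the other ranks wait and then reuse the cached result instead of re-tokenizing. A minimal sketch of the pattern follows; the toy in-memory dataset, the `/tmp/demo` output directory, and the `bert-base-uncased` tokenizer are illustrative assumptions, not taken from the scripts.

# Minimal sketch of the pattern this commit applies (toy data, output dir and
# model name are illustrative assumptions, not taken from the example scripts).
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments

training_args = TrainingArguments(output_dir="/tmp/demo")  # hypothetical output dir
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
raw_dataset = Dataset.from_dict({"text": ["hello world", "context managers are handy"]})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

# Previously the scripts called .map() directly on every process; now the call is
# wrapped so the main process tokenizes (and caches) first and the others reuse it.
with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_dataset = raw_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )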
@@ -356,14 +356,15 @@ def main():
             )
         return output
 
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     if data_args.block_size is None:
         block_size = tokenizer.model_max_length
@@ -404,13 +405,14 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {block_size}",
-    )
+    with training_args.main_process_first(desc="grouping texts together"):
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {block_size}",
+        )
 
     if training_args.do_train:
         if "train" not in tokenized_datasets:
...
@@ -383,14 +383,15 @@ def main():
                 return_special_tokens_mask=True,
             )
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
     else:
         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
@@ -398,14 +399,15 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )
 
         # Main data processing function that will concatenate all texts from our dataset and generate chunks of
         # max_seq_length.
@@ -430,13 +432,14 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc=f"Grouping texts in chunks of {max_seq_length}",
-        )
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {max_seq_length}",
+            )
 
     if training_args.do_train:
         if "train" not in tokenized_datasets:
...
@@ -359,27 +359,29 @@ def main():
             examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
             return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )
 
         # Main data processing function that will concatenate all texts from our dataset and generate chunks of
         # max_seq_length.
@@ -404,13 +406,14 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc=f"Grouping texts in chunks of {max_seq_length}",
-        )
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {max_seq_length}",
+            )
 
     if training_args.do_train:
         if "train" not in tokenized_datasets:
...
@@ -353,12 +353,13 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
 
     if training_args.do_eval:
         if "validation" not in raw_datasets:
@@ -366,12 +367,13 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
 
     # Data collator
     data_collator = (
...
@@ -418,14 +418,15 @@ def main():
             # We will select sample from whole data if agument is specified
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create train feature from dataset
-        train_dataset = train_dataset.map(
-            prepare_train_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -480,14 +481,15 @@ def main():
             # We will select sample from whole data
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
         # Validation Feature Creation
-        eval_dataset = eval_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
@@ -500,14 +502,15 @@ def main():
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
         # Predict Feature Creation
-        predict_dataset = predict_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
...
@@ -429,14 +429,15 @@ def main():
            # Select samples from Dataset, This will help to decrease processing time
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create Training Features
-        train_dataset = train_dataset.map(
-            prepare_train_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
         if data_args.max_train_samples is not None:
             # Select samples from dataset again since Feature Creation might increase number of features
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -515,14 +516,15 @@ def main():
            # Selecting Eval Samples from Dataset
            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
        # Create Features from Eval Dataset
-        eval_dataset = eval_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
         if data_args.max_eval_samples is not None:
             # Selecting Samples from Dataset again since Feature Creation might increase samples size
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
@@ -535,14 +537,15 @@ def main():
            # We will select sample from whole data
            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
        # Test Feature Creation
-        predict_dataset = predict_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
...
@@ -435,14 +435,15 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
 
     if training_args.do_eval:
         max_target_length = data_args.val_max_target_length
@@ -451,14 +452,15 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
 
     if training_args.do_predict:
         max_target_length = data_args.val_max_target_length
@@ -467,14 +469,15 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
 
     # Data collator
     label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
...
@@ -400,12 +400,13 @@ def main():
             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
         return result
 
-    raw_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with training_args.main_process_first(desc="dataset map pre-processing"):
+        raw_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
     if training_args.do_train:
         if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
@@ -526,7 +527,7 @@ def main():
         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
-            predict_dataset.remove_columns_("label")
+            predict_dataset = predict_dataset.remove_columns("label")
             predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
...
@@ -280,12 +280,13 @@ def main():
     if training_args.do_train:
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
         # Log a few random samples from the training set:
         for index in random.sample(range(len(train_dataset)), 3):
             logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
@@ -293,22 +294,24 @@ def main():
     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
 
     if training_args.do_predict:
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
 
     # Get the metric function
     metric = load_metric("xnli")
...
@@ -390,13 +390,14 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
 
     if training_args.do_eval:
         if "validation" not in raw_datasets:
@@ -404,13 +405,14 @@ def main():
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -418,13 +420,14 @@ def main():
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
 
     # Data collator
     data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
...
@@ -370,13 +370,14 @@ def main():
            # Select Sample from Dataset
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # tokenize train dataset in batch
-        train_dataset = train_dataset.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="train dataset map tokenization"):
+            train_dataset = train_dataset.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
 
     if training_args.do_eval:
         if "validation" not in raw_datasets:
@@ -386,13 +387,14 @@ def main():
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        # tokenize validation dataset
-        eval_dataset = eval_dataset.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="validation dataset map tokenization"):
+            eval_dataset = eval_dataset.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -402,13 +404,14 @@ def main():
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        # tokenize predict dataset
-        predict_dataset = predict_dataset.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="prediction dataset map tokenization"):
+            predict_dataset = predict_dataset.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+            )
 
     # Data collator
     data_collator=default_data_collator if not training_args.fp16 else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
...
@@ -503,7 +503,7 @@ def main():
         for test_dataset, task in zip(test_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
-            test_dataset.remove_columns_("label")
+            test_dataset = test_dataset.remove_columns("label")
             predictions = trainer.predict(test_dataset=test_dataset).predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
...
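The "fixed warning of deprecation" part of the commit corresponds to the `remove_columns_` changes in the two classification scripts above: the in-place `Dataset.remove_columns_()` method had been deprecated in the `datasets` library in favor of `Dataset.remove_columns()`, which returns a new dataset. A minimal sketch with toy data (the columns are illustrative assumptions, not taken from the scripts):

# Minimal sketch of the deprecation fix (toy columns are illustrative assumptions).
from datasets import Dataset

predict_dataset = Dataset.from_dict({"sentence": ["a", "b"], "label": [-1, -1]})

# Old, deprecated in-place call the scripts used:
#     predict_dataset.remove_columns_("label")
# Replacement: the functional API returns a copy without the column.
predict_dataset = predict_dataset.remove_columns("label")
print(predict_dataset.column_names)  # ['sentence']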