Unverified Commit a73281e3 authored by Stas Bekman, committed by GitHub

[examples] max samples can't be bigger than the len of dataset (#16501)

* [examples] max samples can't be bigger than the len of dataset

* do tf and flax
parent c4deb7b3
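
The change is the same in every touched example script: before calling `Dataset.select`, the user-supplied `--max_*_samples` cap is clamped to the dataset length, so asking for more samples than the dataset contains no longer fails. A minimal sketch of the pattern outside the scripts (the toy dataset and the cap value below are illustrative, not taken from the diff):

from datasets import Dataset

# Toy dataset with 3 rows (illustrative only).
train_dataset = Dataset.from_dict({"text": ["a", "b", "c"]})
requested = 10  # e.g. a --max_train_samples value larger than the dataset

# Before this commit: select(range(10)) on a 3-row dataset errors out with an
# out-of-range index. After: the cap is clamped to the dataset length first.
max_train_samples = min(len(train_dataset), requested)
train_dataset = train_dataset.select(range(max_train_samples))

print(len(train_dataset))  # 3
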
@@ -613,7 +613,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
@@ -646,7 +647,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
@@ -675,7 +677,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = dataset["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
...
@@ -527,14 +527,16 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
...
@@ -602,7 +602,8 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if agument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         train_dataset = train_dataset.map(
             prepare_train_features,
@@ -613,7 +614,8 @@ def main():
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         processed_raw_datasets["train"] = train_dataset
 
     # Validation preprocessing
@@ -669,7 +671,8 @@ def main():
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         eval_dataset = eval_examples.map(
             prepare_validation_features,
@@ -680,7 +683,8 @@ def main():
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         processed_raw_datasets["validation"] = eval_dataset
 
     if training_args.do_predict:
@@ -700,7 +704,8 @@ def main():
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         processed_raw_datasets["test"] = predict_dataset
 
     # endregion
...
@@ -547,7 +547,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         train_dataset = train_dataset.map(
             preprocess_function,
             batched=True,
@@ -563,7 +564,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -579,7 +581,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = dataset["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         predict_dataset = predict_dataset.map(
             preprocess_function,
             batched=True,
...
@@ -404,7 +404,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
         train_dataset = train_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
@@ -426,7 +427,8 @@ def main():
             raise ValueError("--do_eval requires a train validation")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
         eval_dataset = eval_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
@@ -448,7 +450,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         test_dataset = dataset["test"]
         if data_args.max_eval_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
+            test_dataset = test_dataset.select(range(max_eval_samples))
 
         test_dataset = test_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
...
@@ -445,14 +445,16 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
         def preprocess_logits_for_metrics(logits, labels):
             if isinstance(logits, tuple):
...
@@ -468,14 +468,16 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = tokenized_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
         def preprocess_logits_for_metrics(logits, labels):
             if isinstance(logits, tuple):
...
@@ -438,14 +438,16 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = tokenized_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(
...
@@ -352,7 +352,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -366,7 +367,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
...
@@ -421,7 +421,8 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -434,7 +435,8 @@ def main():
             )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -489,7 +491,8 @@ def main():
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -502,7 +505,8 @@ def main():
             )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -523,7 +527,8 @@ def main():
             )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
 
     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
...
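
The question-answering scripts above apply the clamp twice: once on the raw examples (to keep preprocessing cheap) and again after feature creation, because a batched `map` that splits long contexts into overlapping chunks can return more rows than it was given. A small, hypothetical illustration of why the second clamp is needed (the splitting function and the numbers are made up, not from the scripts):

from datasets import Dataset

examples = Dataset.from_dict({"text": ["one two three", "four five"]})

# A batched map may emit more rows than it receives, the way QA feature
# creation turns one example into several overlapping features.
def explode_into_words(batch):
    return {"text": [word for text in batch["text"] for word in text.split()]}

features = examples.map(explode_into_words, batched=True, remove_columns=["text"])
print(len(examples), len(features))  # 2 -> 5: mapping increased the row count

max_eval_samples = min(len(features), 3)  # 3 stands in for --max_eval_samples
features = features.select(range(max_eval_samples))
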
@@ -432,7 +432,8 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # Select samples from Dataset, This will help to decrease processing time
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create Training Features
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -445,7 +446,8 @@ def main():
             )
         if data_args.max_train_samples is not None:
             # Select samples from dataset again since Feature Creation might increase number of features
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -519,7 +521,8 @@ def main():
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # Selecting Eval Samples from Dataset
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Create Features from Eval Dataset
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -532,7 +535,8 @@ def main():
             )
         if data_args.max_eval_samples is not None:
             # Selecting Samples from Dataset again since Feature Creation might increase samples size
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -553,7 +557,8 @@ def main():
             )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
 
     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
...
@@ -489,7 +489,8 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if agument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -502,7 +503,8 @@ def main():
             )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in raw_datasets:
@@ -510,7 +512,8 @@ def main():
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -523,7 +526,8 @@ def main():
             )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -544,7 +548,8 @@ def main():
             )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
 
     # Data collator
     label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
...
@@ -504,7 +504,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -521,7 +522,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -538,7 +540,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
...
@@ -415,21 +415,24 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
 
     # Log a few random samples from the training set:
     if training_args.do_train:
...
@@ -279,7 +279,8 @@ def main():
 
     if training_args.do_train:
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -293,7 +294,8 @@ def main():
 
     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -304,7 +306,8 @@ def main():
 
     if training_args.do_predict:
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
...
@@ -431,7 +431,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 tokenize_and_align_labels,
@@ -446,7 +447,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 tokenize_and_align_labels,
@@ -461,7 +463,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 tokenize_and_align_labels,
...
@@ -433,7 +433,8 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -450,7 +451,8 @@ def main():
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -467,7 +469,8 @@ def main():
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
...
@@ -398,14 +398,16 @@ def main():
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
...
@@ -434,7 +434,8 @@ def main():
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if agument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -447,7 +448,8 @@ def main():
             )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -497,7 +499,8 @@ def main():
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -510,7 +513,8 @@ def main():
             )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -531,7 +535,8 @@ def main():
             )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
 
     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
...
@@ -375,7 +375,8 @@ def main():
     )
 
     if data_args.max_train_samples is not None:
-        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+        train_dataset = train_dataset.select(range(max_train_samples))
 
     if data_args.max_val_samples is not None:
         eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
...