Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
44eb8bde
Unverified
Commit
44eb8bde
authored
Sep 30, 2021
by
Patrick von Platen
Committed by
GitHub
Sep 30, 2021
Browse files
map only on one process (#13810)
parent
9a9805fc
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
142 additions
and
125 deletions
+142
-125
examples/pytorch/language-modeling/run_clm_no_trainer.py
examples/pytorch/language-modeling/run_clm_no_trainer.py
+17
-15
examples/pytorch/language-modeling/run_mlm_no_trainer.py
examples/pytorch/language-modeling/run_mlm_no_trainer.py
+26
-23
examples/pytorch/multiple-choice/run_swag_no_trainer.py
examples/pytorch/multiple-choice/run_swag_no_trainer.py
+4
-3
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
...torch/question-answering/run_qa_beam_search_no_trainer.py
+30
-27
examples/pytorch/question-answering/run_qa_no_trainer.py
examples/pytorch/question-answering/run_qa_no_trainer.py
+34
-30
examples/pytorch/summarization/run_summarization_no_trainer.py
...les/pytorch/summarization/run_summarization_no_trainer.py
+8
-7
examples/pytorch/text-classification/run_glue_no_trainer.py
examples/pytorch/text-classification/run_glue_no_trainer.py
+7
-6
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+7
-6
examples/pytorch/translation/run_translation_no_trainer.py
examples/pytorch/translation/run_translation_no_trainer.py
+9
-8
No files found.
examples/pytorch/language-modeling/run_clm_no_trainer.py
View file @
44eb8bde
...
@@ -337,14 +337,15 @@ def main():
...
@@ -337,14 +337,15 @@ def main():
def
tokenize_function
(
examples
):
def
tokenize_function
(
examples
):
return
tokenizer
(
examples
[
text_column_name
])
return
tokenizer
(
examples
[
text_column_name
])
tokenized_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
tokenize_function
,
tokenized_datasets
=
raw_datasets
.
map
(
batched
=
True
,
tokenize_function
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on dataset"
,
)
if
args
.
block_size
is
None
:
if
args
.
block_size
is
None
:
block_size
=
tokenizer
.
model_max_length
block_size
=
tokenizer
.
model_max_length
...
@@ -386,13 +387,14 @@ def main():
...
@@ -386,13 +387,14 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
lm_datasets
=
tokenized_datasets
.
map
(
with
accelerator
.
main_process_first
():
group_texts
,
lm_datasets
=
tokenized_datasets
.
map
(
batched
=
True
,
group_texts
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
num_proc
=
args
.
preprocessing_num_workers
,
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
)
train_dataset
=
lm_datasets
[
"train"
]
train_dataset
=
lm_datasets
[
"train"
]
eval_dataset
=
lm_datasets
[
"validation"
]
eval_dataset
=
lm_datasets
[
"validation"
]
...
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py
View file @
44eb8bde
...
@@ -374,14 +374,15 @@ def main():
...
@@ -374,14 +374,15 @@ def main():
return_special_tokens_mask
=
True
,
return_special_tokens_mask
=
True
,
)
)
tokenized_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
tokenize_function
,
tokenized_datasets
=
raw_datasets
.
map
(
batched
=
True
,
tokenize_function
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
[
text_column_name
],
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
[
text_column_name
],
desc
=
"Running tokenizer on dataset line_by_line"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on dataset line_by_line"
,
)
else
:
else
:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
...
@@ -389,14 +390,15 @@ def main():
...
@@ -389,14 +390,15 @@ def main():
def
tokenize_function
(
examples
):
def
tokenize_function
(
examples
):
return
tokenizer
(
examples
[
text_column_name
],
return_special_tokens_mask
=
True
)
return
tokenizer
(
examples
[
text_column_name
],
return_special_tokens_mask
=
True
)
tokenized_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
tokenize_function
,
tokenized_datasets
=
raw_datasets
.
map
(
batched
=
True
,
tokenize_function
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on every text in dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on every text in dataset"
,
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# max_seq_length.
...
@@ -422,13 +424,14 @@ def main():
...
@@ -422,13 +424,14 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
tokenized_datasets
=
tokenized_datasets
.
map
(
with
accelerator
.
main_process_first
():
group_texts
,
tokenized_datasets
=
tokenized_datasets
.
map
(
batched
=
True
,
group_texts
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
num_proc
=
args
.
preprocessing_num_workers
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
train_dataset
=
tokenized_datasets
[
"train"
]
train_dataset
=
tokenized_datasets
[
"train"
]
eval_dataset
=
tokenized_datasets
[
"validation"
]
eval_dataset
=
tokenized_datasets
[
"validation"
]
...
...
examples/pytorch/multiple-choice/run_swag_no_trainer.py
View file @
44eb8bde
...
@@ -381,9 +381,10 @@ def main():
...
@@ -381,9 +381,10 @@ def main():
tokenized_inputs
[
"labels"
]
=
labels
tokenized_inputs
[
"labels"
]
=
labels
return
tokenized_inputs
return
tokenized_inputs
processed_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
processed_datasets
=
raw_datasets
.
map
(
)
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
View file @
44eb8bde
...
@@ -440,14 +440,15 @@ def main():
...
@@ -440,14 +440,15 @@ def main():
# We will select sample from whole data if argument is specified
# We will select sample from whole data if argument is specified
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Create train feature from dataset
# Create train feature from dataset
train_dataset
=
train_dataset
.
map
(
with
accelerator
.
main_process_first
():
prepare_train_features
,
train_dataset
=
train_dataset
.
map
(
batched
=
True
,
prepare_train_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on train dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on train dataset"
,
)
if
args
.
max_train_samples
is
not
None
:
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, we select only the specified max samples
# Number of samples might increase during Feature Creation, we select only the specified max samples
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
...
@@ -530,14 +531,15 @@ def main():
...
@@ -530,14 +531,15 @@ def main():
# We will select sample from whole data
# We will select sample from whole data
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
# Validation Feature Creation
# Validation Feature Creation
eval_dataset
=
eval_examples
.
map
(
with
accelerator
.
main_process_first
():
prepare_validation_features
,
eval_dataset
=
eval_examples
.
map
(
batched
=
True
,
prepare_validation_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on validation dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on validation dataset"
,
)
if
args
.
max_eval_samples
is
not
None
:
if
args
.
max_eval_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -551,17 +553,18 @@ def main():
...
@@ -551,17 +553,18 @@ def main():
# We will select sample from whole data
# We will select sample from whole data
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
# Predict Feature Creation
# Predict Feature Creation
predict_dataset
=
predict_examples
.
map
(
with
accelerator
.
main_process_first
():
prepare_validation_features
,
predict_dataset
=
predict_examples
.
map
(
batched
=
True
,
prepare_validation_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on prediction dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on prediction dataset"
,
if
args
.
max_predict_samples
is
not
None
:
)
# During Feature creation dataset samples might increase, we will select required samples again
if
args
.
max_predict_samples
is
not
None
:
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# Log a few random samples from the training set:
# Log a few random samples from the training set:
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
...
...
examples/pytorch/question-answering/run_qa_no_trainer.py
View file @
44eb8bde
...
@@ -468,18 +468,20 @@ def main():
...
@@ -468,18 +468,20 @@ def main():
if
args
.
max_train_samples
is
not
None
:
if
args
.
max_train_samples
is
not
None
:
# We will select sample from whole data if argument is specified
# We will select sample from whole data if argument is specified
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Create train feature from dataset
# Create train feature from dataset
train_dataset
=
train_dataset
.
map
(
with
accelerator
.
main_process_first
():
prepare_train_features
,
train_dataset
=
train_dataset
.
map
(
batched
=
True
,
prepare_train_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on train dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on train dataset"
,
if
args
.
max_train_samples
is
not
None
:
)
# Number of samples might increase during Feature Creation, we select only the specified max samples
if
args
.
max_train_samples
is
not
None
:
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Number of samples might increase during Feature Creation, we select only the specified max samples
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Validation preprocessing
# Validation preprocessing
def
prepare_validation_features
(
examples
):
def
prepare_validation_features
(
examples
):
...
@@ -535,14 +537,15 @@ def main():
...
@@ -535,14 +537,15 @@ def main():
# We will select sample from whole data
# We will select sample from whole data
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
# Validation Feature Creation
# Validation Feature Creation
eval_dataset
=
eval_examples
.
map
(
with
accelerator
.
main_process_first
():
prepare_validation_features
,
eval_dataset
=
eval_examples
.
map
(
batched
=
True
,
prepare_validation_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on validation dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on validation dataset"
,
)
if
args
.
max_eval_samples
is
not
None
:
if
args
.
max_eval_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -556,17 +559,18 @@ def main():
...
@@ -556,17 +559,18 @@ def main():
# We will select sample from whole data
# We will select sample from whole data
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
# Predict Feature Creation
# Predict Feature Creation
predict_dataset
=
predict_examples
.
map
(
with
accelerator
.
main_process_first
():
prepare_validation_features
,
predict_dataset
=
predict_examples
.
map
(
batched
=
True
,
prepare_validation_features
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on prediction dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on prediction dataset"
,
if
args
.
max_predict_samples
is
not
None
:
)
# During Feature creation dataset samples might increase, we will select required samples again
if
args
.
max_predict_samples
is
not
None
:
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# Log a few random samples from the training set:
# Log a few random samples from the training set:
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
...
...
examples/pytorch/summarization/run_summarization_no_trainer.py
View file @
44eb8bde
...
@@ -439,13 +439,14 @@ def main():
...
@@ -439,13 +439,14 @@ def main():
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
return
model_inputs
return
model_inputs
processed_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
preprocess_function
,
processed_datasets
=
raw_datasets
.
map
(
batched
=
True
,
preprocess_function
,
remove_columns
=
column_names
,
batched
=
True
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
examples/pytorch/text-classification/run_glue_no_trainer.py
View file @
44eb8bde
...
@@ -330,12 +330,13 @@ def main():
...
@@ -330,12 +330,13 @@ def main():
result
[
"labels"
]
=
examples
[
"label"
]
result
[
"labels"
]
=
examples
[
"label"
]
return
result
return
result
processed_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
preprocess_function
,
processed_datasets
=
raw_datasets
.
map
(
batched
=
True
,
preprocess_function
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
batched
=
True
,
desc
=
"Running tokenizer on dataset"
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
)
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation_matched"
if
args
.
task_name
==
"mnli"
else
"validation"
]
eval_dataset
=
processed_datasets
[
"validation_matched"
if
args
.
task_name
==
"mnli"
else
"validation"
]
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
44eb8bde
...
@@ -403,12 +403,13 @@ def main():
...
@@ -403,12 +403,13 @@ def main():
tokenized_inputs
[
"labels"
]
=
labels
tokenized_inputs
[
"labels"
]
=
labels
return
tokenized_inputs
return
tokenized_inputs
processed_raw_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
tokenize_and_align_labels
,
processed_raw_datasets
=
raw_datasets
.
map
(
batched
=
True
,
tokenize_and_align_labels
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
batched
=
True
,
desc
=
"Running tokenizer on dataset"
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
)
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_raw_datasets
[
"train"
]
train_dataset
=
processed_raw_datasets
[
"train"
]
eval_dataset
=
processed_raw_datasets
[
"validation"
]
eval_dataset
=
processed_raw_datasets
[
"validation"
]
...
...
examples/pytorch/translation/run_translation_no_trainer.py
View file @
44eb8bde
...
@@ -418,14 +418,15 @@ def main():
...
@@ -418,14 +418,15 @@ def main():
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
return
model_inputs
return
model_inputs
processed_datasets
=
raw_datasets
.
map
(
with
accelerator
.
main_process_first
():
preprocess_function
,
processed_datasets
=
raw_datasets
.
map
(
batched
=
True
,
preprocess_function
,
num_proc
=
args
.
preprocessing_num_workers
,
batched
=
True
,
remove_columns
=
column_names
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
remove_columns
=
column_names
,
desc
=
"Running tokenizer on dataset"
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
)
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment