chenpangpang/transformers, commit 44eb8bde (unverified)
Authored Sep 30, 2021 by Patrick von Platen; committed by GitHub, Sep 30, 2021

map only on one process (#13810)

Parent: 9a9805fc
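Note: in distributed runs these example scripts previously executed their `datasets.Dataset.map` preprocessing on every process simultaneously. This commit wraps each `map` call in Accelerate's `accelerator.main_process_first()` context manager, so the main process runs the preprocessing (and writes the datasets cache) first; the remaining processes then run the same call and pick the result up from the cache instead of recomputing it.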
Showing 9 changed files with 142 additions and 125 deletions:
examples/pytorch/language-modeling/run_clm_no_trainer.py  +17 -15
examples/pytorch/language-modeling/run_mlm_no_trainer.py  +26 -23
examples/pytorch/multiple-choice/run_swag_no_trainer.py  +4 -3
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py  +30 -27
examples/pytorch/question-answering/run_qa_no_trainer.py  +34 -30
examples/pytorch/summarization/run_summarization_no_trainer.py  +8 -7
examples/pytorch/text-classification/run_glue_no_trainer.py  +7 -6
examples/pytorch/token-classification/run_ner_no_trainer.py  +7 -6
examples/pytorch/translation/run_translation_no_trainer.py  +9 -8
examples/pytorch/language-modeling/run_clm_no_trainer.py

@@ -337,14 +337,15 @@ def main():
     def tokenize_function(examples):
         return tokenizer(examples[text_column_name])
 
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     if args.block_size is None:
         block_size = tokenizer.model_max_length

@@ -386,13 +387,14 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {block_size}",
-    )
+    with accelerator.main_process_first():
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {block_size}",
+        )
 
     train_dataset = lm_datasets["train"]
     eval_dataset = lm_datasets["validation"]
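Note on semantics: `main_process_first()` is essentially a barrier that lets the main process execute the body of the `with` block before the other processes do. A minimal sketch of the idea, assuming an `accelerate.Accelerator` instance (this illustrates the synchronization pattern; it is not the library's actual source):

from contextlib import contextmanager

@contextmanager
def main_process_first(accelerator):
    # Non-main processes block at the barrier until the main process,
    # which skips this call, has finished executing the body.
    if not accelerator.is_main_process:
        accelerator.wait_for_everyone()
    yield
    # The main process reaches the barrier last, releasing the others,
    # which then run the body themselves and hit the datasets cache.
    if accelerator.is_main_process:
        accelerator.wait_for_everyone()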
examples/pytorch/language-modeling/run_mlm_no_trainer.py

@@ -374,14 +374,15 @@ def main():
                 return_special_tokens_mask=True,
             )
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
     else:
         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more

@@ -389,14 +390,15 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )
 
         # Main data processing function that will concatenate all texts from our dataset and generate chunks of
         # max_seq_length.

@@ -422,13 +424,14 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            load_from_cache_file=not args.overwrite_cache,
-            desc=f"Grouping texts in chunks of {max_seq_length}",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                load_from_cache_file=not args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {max_seq_length}",
+            )
 
     train_dataset = tokenized_datasets["train"]
     eval_dataset = tokenized_datasets["validation"]
examples/pytorch/multiple-choice/run_swag_no_trainer.py

@@ -381,9 +381,10 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

@@ -440,14 +440,15 @@ def main():
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
     if args.max_train_samples is not None:
         # Number of samples might increase during Feature Creation, We select only specified max samples
         train_dataset = train_dataset.select(range(args.max_train_samples))

@@ -530,14 +531,15 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again

@@ -551,17 +553,18 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
     if args.max_predict_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
         predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
examples/pytorch/question-answering/run_qa_no_trainer.py

@@ -468,18 +468,20 @@ def main():
     if args.max_train_samples is not None:
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
     if args.max_train_samples is not None:
         # Number of samples might increase during Feature Creation, We select only specified max samples
         train_dataset = train_dataset.select(range(args.max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):

@@ -535,14 +537,15 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again

@@ -556,17 +559,18 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
     if args.max_predict_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
         predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
examples/pytorch/summarization/run_summarization_no_trainer.py

@@ -439,13 +439,14 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
examples/pytorch/text-classification/run_glue_no_trainer.py

@@ -330,12 +330,13 @@ def main():
             result["labels"] = examples["label"]
         return result
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
examples/pytorch/token-classification/run_ner_no_trainer.py

@@ -403,12 +403,13 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_raw_datasets = raw_datasets.map(
-        tokenize_and_align_labels,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_raw_datasets = raw_datasets.map(
+            tokenize_and_align_labels,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_raw_datasets["train"]
     eval_dataset = processed_raw_datasets["validation"]
examples/pytorch/translation/run_translation_no_trainer.py

@@ -418,14 +418,15 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
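Note: a self-contained toy that exercises the same pattern (hypothetical script, not part of this commit; `upper_case` stands in for the tokenization functions above). With an on-disk dataset the non-main processes reuse the cache files written by the main process; with the small in-memory dataset below each process recomputes, but the ordering guarantee is the same:

from accelerate import Accelerator
from datasets import Dataset

accelerator = Accelerator()

# Stand-in corpus; the real scripts load raw_datasets from the Hub or local files.
raw = Dataset.from_dict({"text": ["hello world", "foo bar", "lorem ipsum"]})

def upper_case(examples):
    # Stand-in for tokenize_function: any deterministic, cacheable transform.
    return {"text": [t.upper() for t in examples["text"]]}

# The main process maps (and caches) first; the other processes follow.
with accelerator.main_process_first():
    processed = raw.map(upper_case, batched=True)

accelerator.wait_for_everyone()
if accelerator.is_main_process:
    print(processed[0])  # {'text': 'HELLO WORLD'}

Launched with `accelerate launch toy.py` this runs once per process; under plain `python toy.py` there is a single process and the context manager is effectively a no-op.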