chenpangpang / transformers

Unverified commit 44eb8bde, authored Sep 30, 2021 by Patrick von Platen, committed by GitHub on Sep 30, 2021.

map only on one process (#13810)

Parent: 9a9805fc
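All nine example scripts receive the same change: the dataset preprocessing calls to `datasets.map(...)` are wrapped in `accelerator.main_process_first()`, so that in a distributed run only the main process performs the tokenization/grouping and writes the cache, while the other processes wait and then reuse the cached result instead of each mapping the dataset in parallel. A minimal sketch of the pattern, with a placeholder dataset and checkpoint (wikitext-2 and gpt2 here stand in for whatever the script is actually run with):

    from accelerate import Accelerator
    from datasets import load_dataset
    from transformers import AutoTokenizer

    accelerator = Accelerator()
    raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")  # placeholder dataset
    tokenizer = AutoTokenizer.from_pretrained("gpt2")             # placeholder checkpoint

    def tokenize_function(examples):
        return tokenizer(examples["text"])

    # The main process enters the block first, runs the map and writes the
    # Arrow cache; the remaining processes wait, then run the same map and
    # load the result from that cache instead of recomputing it.
    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)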
Showing 9 changed files with 142 additions and 125 deletions (+142, -125).
examples/pytorch/language-modeling/run_clm_no_trainer.py               +17  -15
examples/pytorch/language-modeling/run_mlm_no_trainer.py               +26  -23
examples/pytorch/multiple-choice/run_swag_no_trainer.py                 +4   -3
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py   +30  -27
examples/pytorch/question-answering/run_qa_no_trainer.py               +34  -30
examples/pytorch/summarization/run_summarization_no_trainer.py          +8   -7
examples/pytorch/text-classification/run_glue_no_trainer.py             +7   -6
examples/pytorch/token-classification/run_ner_no_trainer.py             +7   -6
examples/pytorch/translation/run_translation_no_trainer.py              +9   -8
examples/pytorch/language-modeling/run_clm_no_trainer.py

@@ -337,6 +337,7 @@ def main():
     def tokenize_function(examples):
         return tokenizer(examples[text_column_name])
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
+    with accelerator.main_process_first():
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
...

@@ -386,6 +387,7 @@ def main():
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
+    with accelerator.main_process_first():
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py

@@ -374,6 +374,7 @@ def main():
                 return_special_tokens_mask=True,
             )
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
...

@@ -389,6 +390,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
...

@@ -422,6 +424,7 @@ def main():
         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
         # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
+        with accelerator.main_process_first():
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
...
examples/pytorch/multiple-choice/run_swag_no_trainer.py

@@ -381,6 +381,7 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
-    processed_datasets = raw_datasets.map(
-        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+        )
...
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

@@ -440,6 +440,7 @@ def main():
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
...

@@ -530,6 +531,7 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
...

@@ -551,6 +553,7 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
...
examples/pytorch/question-answering/run_qa_no_trainer.py

@@ -468,7 +468,9 @@ def main():
     if args.max_train_samples is not None:
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
...

@@ -535,6 +537,7 @@ def main():
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
...

@@ -556,6 +559,7 @@ def main():
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
...
examples/pytorch/summarization/run_summarization_no_trainer.py

@@ -439,6 +439,7 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
...
examples/pytorch/text-classification/run_glue_no_trainer.py

@@ -330,6 +330,7 @@ def main():
         result["labels"] = examples["label"]
         return result
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
...
examples/pytorch/token-classification/run_ner_no_trainer.py

@@ -403,6 +403,7 @@ def main():
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
-    processed_raw_datasets = raw_datasets.map(
-        tokenize_and_align_labels,
-        batched=True,
+    with accelerator.main_process_first():
+        processed_raw_datasets = raw_datasets.map(
+            tokenize_and_align_labels,
+            batched=True,
...
examples/pytorch/translation/run_translation_no_trainer.py

@@ -418,6 +418,7 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
...
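For reference, the behaviour the diffs rely on can be pictured with a plain `torch.distributed` barrier. This is an illustrative sketch of the idea only, assuming a `torch.distributed` process group, not Accelerate's actual implementation of `main_process_first()`:

    from contextlib import contextmanager
    import torch.distributed as dist

    @contextmanager
    def main_process_first():
        # Non-main ranks block at the entry barrier; rank 0 runs the body
        # first and joins the barrier only on exit, which releases the other
        # ranks to run the body afterwards (typically hitting a warm cache).
        is_main = not dist.is_initialized() or dist.get_rank() == 0
        if dist.is_initialized() and not is_main:
            dist.barrier()  # wait for rank 0 to finish the block
        yield
        if dist.is_initialized() and is_main:
            dist.barrier()  # release the waiting ranks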