Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
44eb8bde
"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "569743f510780fc42aeb2364501584638f400558"
Unverified
Commit
44eb8bde
authored
Sep 30, 2021
by
Patrick von Platen
Committed by
GitHub
Sep 30, 2021
Browse files
map only on one process (#13810)
parent
9a9805fc
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
142 additions
and
125 deletions
+142
-125
examples/pytorch/language-modeling/run_clm_no_trainer.py
examples/pytorch/language-modeling/run_clm_no_trainer.py
+17
-15
examples/pytorch/language-modeling/run_mlm_no_trainer.py
examples/pytorch/language-modeling/run_mlm_no_trainer.py
+26
-23
examples/pytorch/multiple-choice/run_swag_no_trainer.py
examples/pytorch/multiple-choice/run_swag_no_trainer.py
+4
-3
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
...torch/question-answering/run_qa_beam_search_no_trainer.py
+30
-27
examples/pytorch/question-answering/run_qa_no_trainer.py
examples/pytorch/question-answering/run_qa_no_trainer.py
+34
-30
examples/pytorch/summarization/run_summarization_no_trainer.py
...les/pytorch/summarization/run_summarization_no_trainer.py
+8
-7
examples/pytorch/text-classification/run_glue_no_trainer.py
examples/pytorch/text-classification/run_glue_no_trainer.py
+7
-6
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+7
-6
examples/pytorch/translation/run_translation_no_trainer.py
examples/pytorch/translation/run_translation_no_trainer.py
+9
-8
No files found.
examples/pytorch/language-modeling/run_clm_no_trainer.py
View file @
44eb8bde
...
...
@@ -337,14 +337,15 @@ def main():
def
tokenize_function
(
examples
):
return
tokenizer
(
examples
[
text_column_name
])
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
with
accelerator
.
main_process_first
():
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
if
args
.
block_size
is
None
:
block_size
=
tokenizer
.
model_max_length
...
...
@@ -386,13 +387,14 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
lm_datasets
=
tokenized_datasets
.
map
(
group_texts
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
)
with
accelerator
.
main_process_first
():
lm_datasets
=
tokenized_datasets
.
map
(
group_texts
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
)
train_dataset
=
lm_datasets
[
"train"
]
eval_dataset
=
lm_datasets
[
"validation"
]
...
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py
View file @
44eb8bde
...
...
@@ -374,14 +374,15 @@ def main():
return_special_tokens_mask
=
True
,
)
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
[
text_column_name
],
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset line_by_line"
,
)
with
accelerator
.
main_process_first
():
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
[
text_column_name
],
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset line_by_line"
,
)
else
:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
...
...
@@ -389,14 +390,15 @@ def main():
def
tokenize_function
(
examples
):
return
tokenizer
(
examples
[
text_column_name
],
return_special_tokens_mask
=
True
)
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on every text in dataset"
,
)
with
accelerator
.
main_process_first
():
tokenized_datasets
=
raw_datasets
.
map
(
tokenize_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on every text in dataset"
,
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
...
...
@@ -422,13 +424,14 @@ def main():
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
tokenized_datasets
=
tokenized_datasets
.
map
(
group_texts
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
with
accelerator
.
main_process_first
():
tokenized_datasets
=
tokenized_datasets
.
map
(
group_texts
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
train_dataset
=
tokenized_datasets
[
"train"
]
eval_dataset
=
tokenized_datasets
[
"validation"
]
...
...
examples/pytorch/multiple-choice/run_swag_no_trainer.py
View file @
44eb8bde
...
...
@@ -381,9 +381,10 @@ def main():
tokenized_inputs
[
"labels"
]
=
labels
return
tokenized_inputs
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
)
with
accelerator
.
main_process_first
():
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
)
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
View file @
44eb8bde
...
...
@@ -440,14 +440,15 @@ def main():
# We will select sample from whole data if agument is specified
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Create train feature from dataset
train_dataset
=
train_dataset
.
map
(
prepare_train_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
with
accelerator
.
main_process_first
():
train_dataset
=
train_dataset
.
map
(
prepare_train_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
...
...
@@ -530,14 +531,15 @@ def main():
# We will select sample from whole data
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
# Validation Feature Creation
eval_dataset
=
eval_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
with
accelerator
.
main_process_first
():
eval_dataset
=
eval_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
if
args
.
max_eval_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
...
...
@@ -551,17 +553,18 @@ def main():
# We will select sample from whole data
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
# Predict Feature Creation
predict_dataset
=
predict_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
with
accelerator
.
main_process_first
():
predict_dataset
=
predict_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# Log a few random samples from the training set:
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
...
...
examples/pytorch/question-answering/run_qa_no_trainer.py
View file @
44eb8bde
...
...
@@ -468,18 +468,20 @@ def main():
if
args
.
max_train_samples
is
not
None
:
# We will select sample from whole data if agument is specified
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Create train feature from dataset
train_dataset
=
train_dataset
.
map
(
prepare_train_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
with
accelerator
.
main_process_first
():
train_dataset
=
train_dataset
.
map
(
prepare_train_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset
=
train_dataset
.
select
(
range
(
args
.
max_train_samples
))
# Validation preprocessing
def
prepare_validation_features
(
examples
):
...
...
@@ -535,14 +537,15 @@ def main():
# We will select sample from whole data
eval_examples
=
eval_examples
.
select
(
range
(
args
.
max_eval_samples
))
# Validation Feature Creation
eval_dataset
=
eval_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
with
accelerator
.
main_process_first
():
eval_dataset
=
eval_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
if
args
.
max_eval_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
...
...
@@ -556,17 +559,18 @@ def main():
# We will select sample from whole data
predict_examples
=
predict_examples
.
select
(
range
(
args
.
max_predict_samples
))
# Predict Feature Creation
predict_dataset
=
predict_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
with
accelerator
.
main_process_first
():
predict_dataset
=
predict_examples
.
map
(
prepare_validation_features
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset
=
predict_dataset
.
select
(
range
(
args
.
max_predict_samples
))
# Log a few random samples from the training set:
for
index
in
random
.
sample
(
range
(
len
(
train_dataset
)),
3
):
...
...
examples/pytorch/summarization/run_summarization_no_trainer.py
View file @
44eb8bde
...
...
@@ -439,13 +439,14 @@ def main():
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
return
model_inputs
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
with
accelerator
.
main_process_first
():
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
examples/pytorch/text-classification/run_glue_no_trainer.py
View file @
44eb8bde
...
...
@@ -330,12 +330,13 @@ def main():
result
[
"labels"
]
=
examples
[
"label"
]
return
result
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
desc
=
"Running tokenizer on dataset"
,
)
with
accelerator
.
main_process_first
():
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation_matched"
if
args
.
task_name
==
"mnli"
else
"validation"
]
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
44eb8bde
...
...
@@ -403,12 +403,13 @@ def main():
tokenized_inputs
[
"labels"
]
=
labels
return
tokenized_inputs
processed_raw_datasets
=
raw_datasets
.
map
(
tokenize_and_align_labels
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
desc
=
"Running tokenizer on dataset"
,
)
with
accelerator
.
main_process_first
():
processed_raw_datasets
=
raw_datasets
.
map
(
tokenize_and_align_labels
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_raw_datasets
[
"train"
]
eval_dataset
=
processed_raw_datasets
[
"validation"
]
...
...
examples/pytorch/translation/run_translation_no_trainer.py
View file @
44eb8bde
...
...
@@ -418,14 +418,15 @@ def main():
model_inputs
[
"labels"
]
=
labels
[
"input_ids"
]
return
model_inputs
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
with
accelerator
.
main_process_first
():
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
train_dataset
=
processed_datasets
[
"train"
]
eval_dataset
=
processed_datasets
[
"validation"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment