chenpangpang/transformers · Commit 05c237ea (unverified)
Authored Mar 02, 2022 by Joao Gante, committed via GitHub on Mar 02, 2022

Update TF QA example (#15870)

Parent: 6e57a569
Changes: 1 changed file with 24 additions and 61 deletions

examples/tensorflow/question-answering/run_qa.py (+24, -61)
@@ -32,6 +32,8 @@ import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     EvalPrediction,
     HfArgumentParser,
     PreTrainedTokenizerFast,
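The two collators imported here are what the rewritten training loop hands to the tf.data pipeline later in this diff: DataCollatorWithPadding pads each batch to the length of its longest member, while DefaultDataCollator assumes features are already padded to a fixed length and only stacks them into tensors. A rough illustration of the difference (not part of the commit; the checkpoint name and toy sentences are placeholders):

# Illustrative sketch only, not from the commit: how the two imported collators behave.
# "bert-base-cased" and the toy sentences are stand-ins; any fast tokenizer works the same way.
from transformers import AutoTokenizer, DataCollatorWithPadding, DefaultDataCollator

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
features = [tokenizer("short question?"), tokenizer("a noticeably longer question with several more tokens?")]

# Dynamic padding: each batch is padded to its own longest sequence and returned as TF tensors.
dynamic_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
print(dynamic_collator(features)["input_ids"].shape)  # (2, length of the longer example)

# DefaultDataCollator only stacks features; it expects them to be pre-padded
# (e.g. tokenized with padding="max_length"), which is why the fixed-shape path uses it.
static_collator = DefaultDataCollator(return_tensors="tf")
padded = [tokenizer("short question?", padding="max_length", max_length=16, truncation=True) for _ in range(2)]
print(static_collator(padded)["input_ids"].shape)  # (2, 16)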
@@ -209,51 +211,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
         self.model.save_pretrained(self.output_dir)
 
 
-def convert_dataset_for_tensorflow(
-    dataset, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) if feature in tensor_keys else ragged_tensor
-            for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    tensor_keys = ["attention_mask", "input_ids"]
-    label_keys = ["start_positions", "end_positions"]
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in tensor_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in tensor_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in tensor_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if all([key in dataset.features for key in label_keys]):
-        for key in label_keys:
-            data[key] = tf.convert_to_tensor(dataset[key])
-        dummy_labels = tf.zeros_like(dataset[key])
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, dummy_labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
-
-
 # endregion
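For reference, the deleted helper's core trick was to keep the tokenized columns as ragged tensors and pad each batch only to its own longest sequence after batching. A minimal standalone sketch of that pattern (not part of the commit; the toy input_ids are invented):

# Standalone sketch of the pattern the removed convert_dataset_for_tensorflow used
# (not from the commit; the toy input_ids are invented for illustration).
import tensorflow as tf

data = {"input_ids": tf.ragged.constant([[101, 7592, 102], [101, 2088, 2003, 2307, 102]])}

def densify_ragged_batch(features):
    # After batching, each feature is a ragged tensor; to_tensor() pads it to the
    # longest sequence within that batch only (the "variable_batch" mode above).
    return {name: ragged.to_tensor() for name, ragged in features.items()}

tf_dataset = tf.data.Dataset.from_tensor_slices(data).batch(2).map(densify_ragged_batch)
for batch in tf_dataset:
    print(batch["input_ids"].shape)  # (2, 5): padded only within the batch

The commit drops this hand-rolled conversion in favour of Dataset.to_tf_dataset plus a data collator, which the later hunks wire up.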
@@ -391,6 +348,12 @@ def main():
         )
 
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
 
+    if data_args.pad_to_max_length or isinstance(training_args.strategy, tf.distribute.TPUStrategy):
+        logger.info("Padding all batches to max length because argument was set or we're on TPU.")
+        padding = "max_length"
+    else:
+        padding = False
+
     # Training preprocessing
     def prepare_train_features(examples):
         # Some of the questions have lots of whitespace on the left, which is not useful and will make the
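The new flag centralises the padding decision: on TPU (or when pad_to_max_length is set) every feature is padded to max_seq_length so batch shapes stay constant and no recompilation is needed per shape; otherwise padding is deferred to the per-batch collator. A small illustration of what the flag changes at tokenization time (not from the commit; the checkpoint and question text are placeholders):

# Sketch only: effect of padding="max_length" versus padding=False at tokenization time.
# "distilbert-base-uncased" and the question text are illustrative placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

fixed = tokenizer("What does the padding flag do?", max_length=16, padding="max_length", truncation=True)
dynamic = tokenizer("What does the padding flag do?", max_length=16, padding=False, truncation=True)

print(len(fixed["input_ids"]))    # 16: static shape, the TPU-friendly path
print(len(dynamic["input_ids"]))  # just the real tokens; DataCollatorWithPadding pads later, per batch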
@@ -409,7 +372,7 @@ def main():
             stride=data_args.doc_stride,
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
-            padding="max_length" if data_args.pad_to_max_length else False,
+            padding=padding,
         )
 
         # Since one example might give us several features if it has a long context, we need a map from a feature to
@@ -508,7 +471,7 @@ def main():
             stride=data_args.doc_stride,
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
-            padding="max_length" if data_args.pad_to_max_length else False,
+            padding=padding,
         )
 
         # Since one example might give us several features if it has a long context, we need a map from a feature to
@@ -631,27 +594,27 @@ def main():
             clipnorm=training_args.max_grad_norm,
         )
 
-        def dummy_loss(y_true, y_pred):
-            return tf.reduce_mean(y_pred)
-
-        losses = {"loss": dummy_loss}
-        model.compile(optimizer=optimizer, loss=losses)
+        # no user-specified loss = will use the model internal loss
+        model.compile(optimizer=optimizer)
         # endregion
 
         # region Training
+        if padding:
+            data_collator = DefaultDataCollator(return_tensors="tf")
+        else:
+            data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
+        tensor_keys = ["attention_mask", "input_ids"]
+        label_keys = ["start_positions", "end_positions"]
         if training_args.do_train:
             # Make a tf.data.Dataset for this
-            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-                dataset_mode = "constant_batch"
-            else:
-                dataset_mode = "variable_batch"
-            training_dataset = convert_dataset_for_tensorflow(
-                processed_datasets["train"],
-                batch_size=training_args.per_device_train_batch_size,
-                dataset_mode=dataset_mode,
-                drop_remainder=True,
-                shuffle=True,
-            )
+            training_dataset = processed_datasets["train"].to_tf_dataset(
+                # labels are passed as input, as we will use the model's internal loss
+                columns=tensor_keys + label_keys,
+                shuffle=True,
+                batch_size=training_args.per_device_train_batch_size,
+                collate_fn=data_collator,
+                drop_remainder=True,
+            )
             model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
         # endregion
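Put together, the new training path is: compile without an explicit loss so the model's internally computed QA loss is used, build the tf.data pipeline with to_tf_dataset plus the collator chosen above, and fit. A condensed, self-contained sketch of that flow (not the script itself; the toy pre-tokenized rows, batch size, and checkpoint are illustrative assumptions):

# Condensed sketch of the training path this hunk sets up (not the script itself).
# The toy pre-tokenized rows and "distilbert-base-uncased" are illustrative assumptions.
from datasets import Dataset
from transformers import DefaultDataCollator, TFAutoModelForQuestionAnswering

raw = Dataset.from_dict({
    "input_ids": [[101, 2054, 2003, 1996, 3437, 1029, 102]] * 4,
    "attention_mask": [[1, 1, 1, 1, 1, 1, 1]] * 4,
    "start_positions": [4] * 4,
    "end_positions": [4] * 4,
})

# Labels are passed as inputs; with no loss given to compile(), the model's internal loss is used.
train_set = raw.to_tf_dataset(
    columns=["attention_mask", "input_ids", "start_positions", "end_positions"],
    shuffle=True,
    batch_size=2,
    collate_fn=DefaultDataCollator(return_tensors="tf"),
    drop_remainder=True,
)

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.compile(optimizer="adam")  # no user-specified loss = model internal loss
model.fit(train_set, epochs=1)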