Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
59b7334c
Unverified
Commit
59b7334c
authored
Oct 14, 2022
by
Yih-Dar
Committed by
GitHub
Oct 14, 2022
Browse files
Fix `test_tf_encode_plus_sent_to_model` for `TAPAS` (#19559)
Co-authored-by:
ydshieh
<
ydshieh@users.noreply.github.com
>
parent
1967be98
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
1 deletion
+32
-1
tests/models/tapas/test_tokenization_tapas.py
tests/models/tapas/test_tokenization_tapas.py
+32
-1
No files found.
tests/models/tapas/test_tokenization_tapas.py
View file @
59b7334c
...
...
@@ -143,8 +143,39 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return
input_text
,
output_text
@require_tensorflow_probability
@slow
def test_tf_encode_plus_sent_to_model(self):
    """Check that TAPAS encodings with ``return_tensors="tf"`` can be fed to a TF model.

    First runs the generic parent-class check, then a TAPAS-specific variant:
    TAPAS tokenizers take a table as their first argument, so the table-aware
    ``encode_plus`` / ``batch_encode_plus`` outputs are built here and passed
    to the mapped TF model, which must accept them without raising.
    """
    # Generic (non-table) version of this check from the common tester mixin.
    super().test_tf_encode_plus_sent_to_model()

    # Imported lazily: these mappings require TensorFlow to be installed
    # (and tensorflow_probability for TAPAS, enforced by the decorator above).
    from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING

    MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)

    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            # `continue` (not `return`) so that one unsupported tokenizer
            # does not silently skip the remaining tokenizers in the loop.
            if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                continue

            config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
            config = config_class()

            # Skip configurations this check cannot exercise: encoder-decoder
            # models and models with no pad token defined.
            if config.is_encoder_decoder or config.pad_token_id is None:
                continue

            model = model_class(config)

            # Make sure the model contains at least the full vocabulary size in its embedding matrix
            self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

            # Build a query sequence from the first ten vocab entries; an
            # empty table (length=0) keeps the check focused on the text path.
            first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
            sequence = " ".join(first_ten_tokens)
            table = self.get_table(tokenizer, length=0)
            encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="tf")
            batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="tf")

            # This should not fail
            model(encoded_sequence)
            model(batch_encoded_sequence)
def
test_rust_and_python_full_tokenizers
(
self
):
if
not
self
.
test_rust_tokenizer
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment