Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
af38c837
Unverified
Commit
af38c837
authored
Oct 06, 2023
by
Towdo
Committed by
GitHub
Oct 06, 2023
Browse files
Fixed inconsistency in several fast tokenizers (#26561)
parent
8878eb1b
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing 13 changed files with 31 additions and 23 deletions
+31
-23
src/transformers/models/bert/tokenization_bert_fast.py
src/transformers/models/bert/tokenization_bert_fast.py
+1
-1
src/transformers/models/convbert/tokenization_convbert_fast.py
...ransformers/models/convbert/tokenization_convbert_fast.py
+1
-1
src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
...odels/deprecated/retribert/tokenization_retribert_fast.py
+1
-1
src/transformers/models/distilbert/tokenization_distilbert_fast.py
...formers/models/distilbert/tokenization_distilbert_fast.py
+1
-1
src/transformers/models/electra/tokenization_electra_fast.py
src/transformers/models/electra/tokenization_electra_fast.py
+1
-1
src/transformers/models/funnel/tokenization_funnel_fast.py
src/transformers/models/funnel/tokenization_funnel_fast.py
+1
-1
src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
...ransformers/models/layoutlm/tokenization_layoutlm_fast.py
+1
-1
src/transformers/models/lxmert/tokenization_lxmert_fast.py
src/transformers/models/lxmert/tokenization_lxmert_fast.py
+1
-1
src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
...formers/models/mobilebert/tokenization_mobilebert_fast.py
+1
-1
src/transformers/models/realm/tokenization_realm_fast.py
src/transformers/models/realm/tokenization_realm_fast.py
+1
-1
src/transformers/models/roformer/tokenization_roformer_fast.py
...ransformers/models/roformer/tokenization_roformer_fast.py
+1
-1
src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
...rmers/models/squeezebert/tokenization_squeezebert_fast.py
+1
-1
tests/test_tokenization_common.py
tests/test_tokenization_common.py
+19
-11
No files found.
src/transformers/models/bert/tokenization_bert_fast.py
View file @
af38c837
...
...
@@ -265,7 +265,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/convbert/tokenization_convbert_fast.py
View file @
af38c837
...
...
@@ -159,7 +159,7 @@ class ConvBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
View file @
af38c837
...
...
@@ -164,7 +164,7 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/distilbert/tokenization_distilbert_fast.py
View file @
af38c837
...
...
@@ -190,7 +190,7 @@ class DistilBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/electra/tokenization_electra_fast.py
View file @
af38c837
...
...
@@ -192,7 +192,7 @@ class ElectraTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/funnel/tokenization_funnel_fast.py
View file @
af38c837
...
...
@@ -212,7 +212,7 @@ class FunnelTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
View file @
af38c837
...
...
@@ -166,7 +166,7 @@ class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/lxmert/tokenization_lxmert_fast.py
View file @
af38c837
...
...
@@ -152,7 +152,7 @@ class LxmertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
View file @
af38c837
...
...
@@ -150,7 +150,7 @@ class MobileBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/realm/tokenization_realm_fast.py
View file @
af38c837
...
...
@@ -282,7 +282,7 @@ class RealmTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/roformer/tokenization_roformer_fast.py
View file @
af38c837
...
...
@@ -163,7 +163,7 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
View file @
af38c837
...
...
@@ -173,7 +173,7 @@ class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]

         return output
...
...
tests/test_tokenization_common.py
View file @
af38c837
...
...
@@ -3209,9 +3209,17 @@ class TokenizerTesterMixin:
                 # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                 # self.assertEqual(output_p, output_r)

+                input_pairs = [
+                    ("", ""),
+                    ("", "This is a sample pair"),
+                    ("This is a sample input", ""),
+                    ("This is a sample input", "This is a sample pair"),
+                ]
+
+                for sample_input, sample_pair in input_pairs:
                     # Input tokens id
-                    input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
-                    input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
+                    input_simple = tokenizer_p.encode(sample_input, add_special_tokens=False)
+                    input_pair = tokenizer_p.encode(sample_pair, add_special_tokens=False)

                     # Generate output
                     output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment