chenpangpang / transformers · Commits · 870e6f29

Commit 870e6f29: Fix DeBERTa `token_type_ids` (#17082)
Authored May 04, 2022 by Patrick Deutschmann, committed via GitHub on May 04, 2022
Parent: 279bc584
Showing 4 changed files with 10 additions and 4 deletions (+10 -4):
src/transformers/convert_slow_tokenizer.py (+1 -1)
src/transformers/models/deberta/tokenization_deberta.py (+1 -1)
src/transformers/models/deberta/tokenization_deberta_fast.py (+2 -2)
tests/models/deberta/test_tokenization_deberta.py (+6 -0)
src/transformers/convert_slow_tokenizer.py

```diff
@@ -407,7 +407,7 @@ class DebertaConverter(Converter):
         tokenizer.decoder = decoders.ByteLevel()
         tokenizer.post_processor = processors.TemplateProcessing(
             single="[CLS]:0 $A:0 [SEP]:0",
-            pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
+            pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
             special_tokens=[
                 ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
                 ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
```
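For context: in the `tokenizers` library's `processors.TemplateProcessing`, the `:0`/`:1` suffix after each template piece is the type ID that piece receives, so the old template typed the entire pair as segment 0. A minimal sketch of the corrected behavior, using a made-up word-level vocabulary rather than DeBERTa's real byte-level BPE:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, processors

# Toy vocabulary for illustration only; DeBERTa actually uses byte-level BPE.
vocab = {"[CLS]": 0, "[SEP]": 1, "hello": 2, "world": 3, "[UNK]": 4}
tok = Tokenizer(models.WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()

# The template as fixed by this commit: segment B now carries type ID 1.
tok.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 0), ("[SEP]", 1)],
)

enc = tok.encode("hello", "world")
print(enc.tokens)    # ['[CLS]', 'hello', '[SEP]', 'world', '[SEP]']
print(enc.type_ids)  # [0, 0, 0, 1, 1]
```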
src/transformers/models/deberta/tokenization_deberta.py

```diff
@@ -210,7 +210,7 @@ class DebertaTokenizer(GPT2Tokenizer):
         if token_ids_1 is None:
             return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
```
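The new return statement is the standard BERT-style segment mask: zeros over `[CLS] A [SEP]`, ones over `B [SEP]`. A standalone sketch of the same logic, with placeholder special-token IDs rather than DeBERTa's real ones:

```python
def create_token_type_ids(token_ids_0, token_ids_1=None, cls_id=1, sep_id=2):
    # Mirrors DebertaTokenizer.create_token_type_ids_from_sequences after
    # this fix; cls_id/sep_id are illustrative values, not DeBERTa's real IDs.
    cls, sep = [cls_id], [sep_id]
    if token_ids_1 is None:
        # Single sequence: everything is segment 0.
        return len(cls + token_ids_0 + sep) * [0]
    # Pair: segment A (with [CLS] and its [SEP]) gets 0s, segment B gets 1s.
    return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

print(create_token_type_ids([10, 11], [20, 21, 22]))  # [0, 0, 0, 0, 1, 1, 1, 1]
```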
src/transformers/models/deberta/tokenization_deberta_fast.py

````diff
@@ -183,7 +183,7 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
         sequence pair mask has the following format:
         ```
-        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
         | first sequence    | second sequence |
         ```
@@ -203,4 +203,4 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
         if token_ids_1 is None:
             return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
````
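With both hunks applied, the fast tokenizer's docstring and its `create_token_type_ids_from_sequences` agree with the slow tokenizer. A hedged usage sketch (requires `transformers` and access to the `microsoft/deberta-base` checkpoint; the exact output length assumes "Hello" and "World" each map to a single BPE token):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/deberta-base")  # fast by default
enc = tok("Hello", "World")
# Zeros span "[CLS] Hello [SEP]", ones span "World [SEP]".
print(enc["token_type_ids"])  # expected: [0, 0, 0, 1, 1]
```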
tests/models/deberta/test_tokenization_deberta.py

```diff
@@ -88,6 +88,12 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
+    def test_token_type_ids(self):
+        tokenizer = self.get_tokenizer()
+        tokd = tokenizer("Hello", "World")
+        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+        self.assertListEqual(tokd["token_type_ids"], expected_token_type_ids)
+
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("microsoft/deberta-base")
```
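To reproduce the new test locally, one option is to invoke pytest from the root of a transformers checkout (assuming pytest and the dev dependencies are installed; the path and filter come straight from this diff):

```python
import pytest

# Run only the new token_type_ids test from the repository root.
pytest.main(["tests/models/deberta/test_tokenization_deberta.py",
             "-k", "test_token_type_ids"])
```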