Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
cc360649
Unverified
Commit
cc360649
authored
Oct 14, 2021
by
Patrick von Platen
Committed by
GitHub
Oct 14, 2021
Browse files
up (#13988)
parent
5b6bd4e7
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
1 deletion
+17
-1
src/transformers/models/byt5/tokenization_byt5.py
src/transformers/models/byt5/tokenization_byt5.py
+1
-1
tests/test_tokenization_byt5.py
tests/test_tokenization_byt5.py
+16
-0
No files found.
src/transformers/models/byt5/tokenization_byt5.py
View file @
cc360649
...
@@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
...
@@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
else
:
else
:
tok_string
=
bytes
([
ord
(
token
)])
tok_string
=
bytes
([
ord
(
token
)])
bstring
+=
tok_string
bstring
+=
tok_string
string
=
bstring
.
decode
(
"utf-8"
)
string
=
bstring
.
decode
(
"utf-8"
,
errors
=
"ignore"
)
return
string
return
string
# ByT5Tokenizer has no vocab file
# ByT5Tokenizer has no vocab file
...
...
tests/test_tokenization_byt5.py
View file @
cc360649
...
@@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
),
),
)
)
def
test_decode_single_bytes
(
self
):
tokenizer_list
=
[]
if
self
.
test_slow_tokenizer
:
tokenizer_list
.
append
((
self
.
tokenizer_class
,
self
.
get_tokenizer
()))
if
self
.
test_rust_tokenizer
:
tokenizer_list
.
append
((
self
.
rust_tokenizer_class
,
self
.
get_rust_tokenizer
()))
for
tokenizer_class
,
tokenizer_utils
in
tokenizer_list
:
with
tempfile
.
TemporaryDirectory
()
as
tmp_dir
:
tokenizer_utils
.
save_pretrained
(
tmp_dir
)
tokenizer
=
tokenizer_class
.
from_pretrained
(
tmp_dir
)
self
.
assertTrue
(
tokenizer
.
decode
([
255
])
==
""
)
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
def
test_pretrained_model_lists
(
self
):
def
test_pretrained_model_lists
(
self
):
pass
pass
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment