chenpangpang / transformers

Commit 3d87991f, authored Aug 13, 2019 by LysandreJik

Fixed error with encoding

Parent: 634a3172
Showing 2 changed files with 8 additions and 10 deletions:

pytorch_transformers/tests/tokenization_roberta_test.py  (+5 −2)
pytorch_transformers/tokenization_utils.py  (+3 −8)
pytorch_transformers/tests/tokenization_roberta_test.py
...
@@ -81,11 +81,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
+        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
+        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
+
         encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
         encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
 
-        assert encoded_sentence == [0] + text + [2]
-        assert encoded_pair == [0] + text + [2, 2] + text_2 + [2]
+        assert encoded_sentence == encoded_text_from_decode
+        assert encoded_pair == encoded_pair_from_decode
 
 
 if __name__ == '__main__':
...
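
The new assertions tie the convenience path, encode(..., add_special_tokens=True), to the explicit builder methods instead of only to hard-coded ids. A minimal sketch of the behaviour under test, assuming a roberta-base vocabulary can be loaded (in RoBERTa's vocabulary <s> = 0 and </s> = 2); this is an illustration, not part of the commit:

    # Sketch: what the updated test asserts, as a standalone script.
    from pytorch_transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    text = tokenizer.encode("sequence builders")      # bare ids, no special tokens
    text_2 = tokenizer.encode("multi-sequence build")

    # Convenience path: special tokens added inside encode().
    one = tokenizer.encode("sequence builders", add_special_tokens=True)
    pair = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

    # Explicit builder path; RoBERTa wraps as <s> A </s> and <s> A </s></s> B </s>.
    assert one == tokenizer.add_special_tokens_single_sentence(text) == [0] + text + [2]
    assert pair == tokenizer.add_special_tokens_sentences_pair(text, text_2) == [0] + text + [2, 2] + text_2 + [2]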
pytorch_transformers/tokenization_utils.py
...
@@ -519,24 +519,19 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text, add_special_tokens=False, *sequences):
+    def encode(self, text, text_pair=None, add_special_tokens=False):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
 
             Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
-
-        if len(sequences) == 0:
+        if text_pair is None:
             if add_special_tokens:
                 return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
             else:
                 return self.convert_tokens_to_ids(self.tokenize(text))
 
-        if len(sequences) > 1:
-            logger.warning("Tokenization currently only supports sentence pairs. Ignoring every string following the "
-                           "initial two.")
-
         first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(sequences[0])]
+        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
 
         if add_special_tokens:
             return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
...
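
Why the signature change fixes the encoding error: with the old def encode(self, text, add_special_tokens=False, *sequences), a positional pair call such as tokenizer.encode(first, second) bound the second string to add_special_tokens (any non-empty string is truthy) and left sequences empty, so the single-sentence branch ran and no pair was ever built; combining a positional second string with add_special_tokens=True, as the updated test does, would even raise a TypeError for a duplicate argument. That reading of the bug is inferred from the diff, not stated in the commit message. A short usage sketch against the fixed signature, illustrative only:

    # Sketch: calling the fixed PreTrainedTokenizer.encode(). Any concrete
    # tokenizer works; roberta-base is assumed here for continuity.
    from pytorch_transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    ids = tokenizer.encode("first sentence")              # single sentence, bare ids
    pair_ids = tokenizer.encode("first sentence",         # text
                                "second sentence",        # text_pair, now explicit
                                add_special_tokens=True)  # wrap with special tokens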