Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
8cba0572
Commit
8cba0572
authored
Sep 19, 2019
by
LysandreJik
Browse files
Doc + remove artefacts
parent
6393261e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
2 additions
and
39 deletions
+2
-39
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+2
-39
No files found.
pytorch_transformers/tokenization_utils.py
View file @
8cba0572
...
@@ -724,9 +724,8 @@ class PreTrainedTokenizer(object):
...
@@ -724,9 +724,8 @@ class PreTrainedTokenizer(object):
def
encode_plus
(
self
,
text
,
text_pair
=
None
,
add_special_tokens
=
False
,
output_mask
=
False
,
max_length
=
None
,
**
kwargs
):
def
encode_plus
(
self
,
text
,
text_pair
=
None
,
add_special_tokens
=
False
,
output_mask
=
False
,
max_length
=
None
,
**
kwargs
):
"""
"""
Converts a string into a sequence of ids (integers), using the tokenizer and vocabulary.
Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
Args:
text: The first sequence to be encoded.
text: The first sequence to be encoded.
...
@@ -801,42 +800,6 @@ class PreTrainedTokenizer(object):
...
@@ -801,42 +800,6 @@ class PreTrainedTokenizer(object):
return
information
return
information
if
text_pair
is
None
:
if
add_special_tokens
:
sequence_tokens
=
self
.
convert_tokens_to_ids
(
self
.
tokenize
(
text
,
**
kwargs
))
if
max_length
:
sequence_tokens
=
sequence_tokens
[:
max_length
-
self
.
num_added_tokens
()]
return
self
.
add_special_tokens_single_sentence
(
sequence_tokens
)
else
:
ids
=
self
.
convert_tokens_to_ids
(
self
.
tokenize
(
text
,
**
kwargs
))
return
ids
[:
max_length
]
if
max_length
!=
-
1
else
ids
first_sentence_tokens
=
[
self
.
_convert_token_to_id
(
token
)
for
token
in
self
.
tokenize
(
text
,
**
kwargs
)]
second_sentence_tokens
=
[
self
.
_convert_token_to_id
(
token
)
for
token
in
self
.
tokenize
(
text_pair
,
**
kwargs
)]
if
add_special_tokens
:
if
max_length
:
if
len
(
first_sentence_tokens
)
+
self
.
num_added_tokens
(
pair
=
True
)
>=
max_length
:
logger
.
warning
(
"The first sequence is longer than the maximum specified length. This sequence will not be truncated."
)
else
:
if
len
(
second_sentence_tokens
)
+
len
(
first_sentence_tokens
)
+
self
.
num_added_tokens
(
pair
=
True
)
>
max_length
:
second_sentence_tokens
=
second_sentence_tokens
[
:
max_length
-
len
(
first_sentence_tokens
)
-
self
.
num_added_tokens
(
pair
=
True
)]
return
self
.
add_special_tokens_sentences_pair
(
first_sentence_tokens
,
second_sentence_tokens
,
output_mask
)
else
:
if
max_length
:
first_sentence_tokens
=
first_sentence_tokens
[:
max_length
]
second_sentence_tokens
=
second_sentence_tokens
[:
max_length
]
if
output_mask
:
logger
.
warning
(
"Can't output mask if you're not joining two sequences."
)
return
first_sentence_tokens
,
second_sentence_tokens
def add_special_tokens_single_sentence(self, token_ids):
    """Return ``token_ids`` unchanged.

    Base-class fallback: this tokenizer defines no special tokens, so there
    is nothing to add. A warning is emitted so callers know the sequence
    came back without modification; subclasses that do use special tokens
    override this method.
    """
    logger.warning(
        "This tokenizer does not make use of special tokens. The sequence has been returned with no modification."
    )
    return token_ids
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment