chenpangpang / transformers · Commit a67e7478

Authored Nov 14, 2019 by Lysandre

Reorganized max_len warning

Parent: e18f786c
Showing 1 changed file with 5 additions and 4 deletions.

transformers/tokenization_utils.py (+5, -4)
@@ -671,10 +671,6 @@ class PreTrainedTokenizer(object):
         ids = []
         for token in tokens:
             ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
         return ids
 
     def _convert_token_to_id_with_added_voc(self, token):
@@ -877,6 +873,11 @@ class PreTrainedTokenizer(object):
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
             encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
 
+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
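With this change the length check no longer runs in convert_tokens_to_ids; it is performed while inputs are prepared for the model, and only when no explicit max_length is passed. Below is a minimal sketch of the resulting behavior, not taken from the commit itself. It assumes a transformers version from around this commit and that the bert-base-uncased vocabulary (whose max_len is 512) can be loaded; the model name and sample text are placeholders.

from transformers import BertTokenizer

# Assumes the pretrained vocab is available locally or downloadable.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # tokenizer.max_len == 512

long_text = "hello " * 600  # tokenizes to far more than 512 tokens

# No warning here anymore: convert_tokens_to_ids no longer checks max_len.
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(long_text))

# The warning now fires here, because max_length is None and the encoded
# sequence is longer than tokenizer.max_len (checked while preparing inputs).
ids = tokenizer.encode(long_text)

# No warning when max_length is given: the sequence is truncated instead.
ids = tokenizer.encode(long_text, max_length=tokenizer.max_len)

The warning text itself is unchanged; only the place it is emitted from moves.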