chenpangpang / transformers · Commits

Commit c832f43a authored Sep 24, 2019 by LysandreJik
`output_token_type` -> `token_type_ids`
parent 3927d775
Showing 3 changed files with 5 additions and 5 deletions (+5 -5)

examples/utils_glue.py (+1 -1)
pytorch_transformers/tests/tokenization_tests_commons.py (+1 -1)
pytorch_transformers/tokenization_utils.py (+3 -3)
examples/utils_glue.py
@@ -413,7 +413,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             max_length=max_seq_length,
             truncate_first_sequence=True  # We're truncating the first sequence as a priority
         )
-        input_ids, segment_ids = inputs["input_ids"], inputs["output_token_type"]
+        input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
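After this rename, callers read the segment mask from the "token_type_ids" key. A minimal sketch of the consuming pattern, assuming a BERT tokenizer (the pretrained checkpoint name is illustrative, and output_token_type is passed explicitly rather than relying on its default):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # encode_plus returns a dict; the segment mask now lives under
    # "token_type_ids" instead of the old "output_token_type" key.
    inputs = tokenizer.encode_plus(
        "First sequence.",
        "Second sequence.",
        add_special_tokens=True,
        output_token_type=True,
    )
    input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]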
pytorch_transformers/tests/tokenization_tests_commons.py
@@ -197,7 +197,7 @@ class CommonTestCases:
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
             information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
-            sequences, mask = information["input_ids"], information["output_token_type"]
+            sequences, mask = information["input_ids"], information["token_type_ids"]
             assert len(sequences) == len(mask)

         def test_number_of_added_tokens(self):
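The assertion checks one invariant: every input id gets exactly one token type id. Sketched with hypothetical id values for a BERT-style pair encoding (the actual ids depend on the tokenizer's vocabulary):

    # [CLS] seq_0 tokens [SEP] seq_1 tokens [SEP]  (values are made up)
    information = {
        "input_ids":      [101, 3231, 2023, 102, 2007, 2122, 102],
        "token_type_ids": [0,   0,    0,    0,   1,    1,    1],
    }
    sequences, mask = information["input_ids"], information["token_type_ids"]
    assert len(sequences) == len(mask)  # one segment id per input id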
pytorch_transformers/tokenization_utils.py
@@ -765,7 +765,7 @@ class PreTrainedTokenizer(object):
             information["input_ids"] = sequence_tokens
             if output_token_type:
-                information["output_token_type"] = [0] * len(information["input_ids"])
+                information["token_type_ids"] = [0] * len(information["input_ids"])
         else:
             first_sentence_tokens = get_input_ids(text)
             second_sentence_tokens = get_input_ids(text_pair)
@@ -780,7 +780,7 @@ class PreTrainedTokenizer(object):
             )
             if output_token_type:
-                information["output_token_type"] = self.create_mask_from_sequences(text, text_pair)
+                information["token_type_ids"] = self.create_mask_from_sequences(text, text_pair)
         else:
             logger.warning("No special tokens were added. The two sequences have been concatenated.")
             sequence = first_sentence_tokens + second_sentence_tokens
@@ -789,7 +789,7 @@ class PreTrainedTokenizer(object):
             information["overflowing_tokens"] = sequence[max_length - stride:]
             sequence = sequence[:max_length]
             if output_token_type:
-                information["output_token_type"] = [0] * len(sequence)
+                information["token_type_ids"] = [0] * len(sequence)
             information["input_ids"] = sequence
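Taken together, the three changed branches populate information["token_type_ids"] in two ways: all zeros for a single sequence (or a truncated concatenation without special tokens), and a 0/1 segment split via create_mask_from_sequences for a pair with special tokens. A standalone sketch of that logic, where the helper body is an assumption about BERT-style behaviour rather than the actual create_mask_from_sequences implementation:

    def token_type_ids_for(first_len, second_len=None):
        # Single sequence: every position belongs to segment 0,
        # mirroring information["token_type_ids"] = [0] * len(...).
        if second_len is None:
            return [0] * first_len
        # Pair: segment 0 for the first sequence (and its special tokens),
        # segment 1 for the second, as a BERT-style mask is built.
        return [0] * first_len + [1] * second_len

    assert token_type_ids_for(5) == [0, 0, 0, 0, 0]
    assert token_type_ids_for(3, 2) == [0, 0, 0, 1, 1]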