Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
7d709e55
Commit
7d709e55
authored
Oct 22, 2019
by
Lysandre
Browse files
Remove
parent
44286b94
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
41 additions
and
39 deletions
+41
-39
examples/benchmarks.py
examples/benchmarks.py
+2
-2
examples/distillation/scripts/binarized_data.py
examples/distillation/scripts/binarized_data.py
+1
-1
examples/run_generation.py
examples/run_generation.py
+1
-1
transformers/tests/tokenization_bert_test.py
transformers/tests/tokenization_bert_test.py
+2
-2
transformers/tests/tokenization_distilbert_test.py
transformers/tests/tokenization_distilbert_test.py
+2
-2
transformers/tests/tokenization_roberta_test.py
transformers/tests/tokenization_roberta_test.py
+4
-4
transformers/tests/tokenization_tests_commons.py
transformers/tests/tokenization_tests_commons.py
+15
-13
transformers/tests/tokenization_xlm_test.py
transformers/tests/tokenization_xlm_test.py
+2
-2
transformers/tests/tokenization_xlnet_test.py
transformers/tests/tokenization_xlnet_test.py
+2
-2
transformers/tokenization_utils.py
transformers/tokenization_utils.py
+10
-10
No files found.
examples/benchmarks.py
View file @
7d709e55
...
@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
...
@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
model
=
AutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
model
=
AutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
...
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
...
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
model
=
TFAutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
model
=
TFAutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
...
...
examples/distillation/scripts/binarized_data.py
View file @
7d709e55
...
@@ -68,7 +68,7 @@ def main():
...
@@ -68,7 +68,7 @@ def main():
start
=
time
.
time
()
start
=
time
.
time
()
for
text
in
data
:
for
text
in
data
:
text
=
f
'
{
bos
}
{
text
.
strip
()
}
{
sep
}
'
text
=
f
'
{
bos
}
{
text
.
strip
()
}
{
sep
}
'
token_ids
=
tokenizer
.
encode
(
text
)
token_ids
=
tokenizer
.
encode
(
text
,
add_special_tokens
=
False
)
rslt
.
append
(
token_ids
)
rslt
.
append
(
token_ids
)
iter
+=
1
iter
+=
1
...
...
examples/run_generation.py
View file @
7d709e55
...
@@ -223,7 +223,7 @@ def main():
...
@@ -223,7 +223,7 @@ def main():
if
args
.
model_type
in
[
"transfo-xl"
,
"xlnet"
]:
if
args
.
model_type
in
[
"transfo-xl"
,
"xlnet"
]:
# Models with memory likes to have a long prompt for short inputs.
# Models with memory likes to have a long prompt for short inputs.
raw_text
=
(
args
.
padding_text
if
args
.
padding_text
else
PADDING_TEXT
)
+
raw_text
raw_text
=
(
args
.
padding_text
if
args
.
padding_text
else
PADDING_TEXT
)
+
raw_text
context_tokens
=
tokenizer
.
encode
(
raw_text
)
context_tokens
=
tokenizer
.
encode
(
raw_text
,
add_special_tokens
=
False
)
out
=
sample_sequence
(
out
=
sample_sequence
(
model
=
model
,
model
=
model
,
context
=
context_tokens
,
context
=
context_tokens
,
...
...
transformers/tests/tokenization_bert_test.py
View file @
7d709e55
...
@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
def
test_sequence_builders
(
self
):
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
"bert-base-uncased"
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
"bert-base-uncased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_distilbert_test.py
View file @
7d709e55
...
@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
...
@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
def
test_sequence_builders
(
self
):
def
test_sequence_builders
(
self
):
tokenizer
=
DistilBertTokenizer
.
from_pretrained
(
"distilbert-base-uncased"
)
tokenizer
=
DistilBertTokenizer
.
from_pretrained
(
"distilbert-base-uncased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_roberta_test.py
View file @
7d709e55
...
@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
self
.
assertListEqual
(
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world!'
),
tokenizer
.
encode
(
'Hello world!'
,
add_special_tokens
=
False
),
[
0
,
31414
,
232
,
328
,
2
]
[
0
,
31414
,
232
,
328
,
2
]
)
)
self
.
assertListEqual
(
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world! cécé herlolip 418'
),
tokenizer
.
encode
(
'Hello world! cécé herlolip 418'
,
add_special_tokens
=
False
),
[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]
[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]
)
)
def
test_sequence_builders
(
self
):
def
test_sequence_builders
(
self
):
tokenizer
=
RobertaTokenizer
.
from_pretrained
(
"roberta-base"
)
tokenizer
=
RobertaTokenizer
.
from_pretrained
(
"roberta-base"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_text_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
True
)
encoded_text_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
True
)
encoded_pair_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
"multi-sequence build"
,
add_special_tokens
=
True
)
encoded_pair_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
"multi-sequence build"
,
add_special_tokens
=
True
)
...
...
transformers/tests/tokenization_tests_commons.py
View file @
7d709e55
...
@@ -79,13 +79,13 @@ class CommonTestCases:
...
@@ -79,13 +79,13 @@ class CommonTestCases:
# Now let's start the test
# Now let's start the test
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
before_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant\u00E9d,running"
)
before_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant\u00E9d,running"
,
add_special_tokens
=
False
)
with
TemporaryDirectory
()
as
tmpdirname
:
with
TemporaryDirectory
()
as
tmpdirname
:
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant\u00E9d,running"
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant\u00E9d,running"
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
...
@@ -130,7 +130,7 @@ class CommonTestCases:
...
@@ -130,7 +130,7 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
)
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
,
add_special_tokens
=
False
)
out_string
=
tokenizer
.
decode
(
tokens
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
...
@@ -148,7 +148,8 @@ class CommonTestCases:
...
@@ -148,7 +148,8 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
)
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
,
add_special_tokens
=
False
)
out_string
=
tokenizer
.
decode
(
tokens
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
...
@@ -166,7 +167,7 @@ class CommonTestCases:
...
@@ -166,7 +167,7 @@ class CommonTestCases:
tokens
=
tokenizer
.
tokenize
(
input_text
)
tokens
=
tokenizer
.
tokenize
(
input_text
)
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
ids_2
=
tokenizer
.
encode
(
input_text
)
ids_2
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
ids
,
ids_2
)
self
.
assertListEqual
(
ids
,
ids_2
)
tokens_2
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
tokens_2
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
...
@@ -206,7 +207,7 @@ class CommonTestCases:
...
@@ -206,7 +207,7 @@ class CommonTestCases:
seq_0
=
"Test this method."
seq_0
=
"Test this method."
seq_1
=
"With these inputs."
seq_1
=
"With these inputs."
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
)
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
False
)
attached_sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
attached_sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
# Method is implemented (e.g. not GPT-2)
# Method is implemented (e.g. not GPT-2)
...
@@ -219,7 +220,7 @@ class CommonTestCases:
...
@@ -219,7 +220,7 @@ class CommonTestCases:
seq_0
=
"This is a sentence to be encoded."
seq_0
=
"This is a sentence to be encoded."
stride
=
2
stride
=
2
sequence
=
tokenizer
.
encode
(
seq_0
)
sequence
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
num_added_tokens
=
tokenizer
.
num_added_tokens
()
num_added_tokens
=
tokenizer
.
num_added_tokens
()
total_length
=
len
(
sequence
)
+
num_added_tokens
total_length
=
len
(
sequence
)
+
num_added_tokens
information
=
tokenizer
.
encode_plus
(
seq_0
,
max_length
=
total_length
-
2
,
add_special_tokens
=
True
,
stride
=
stride
)
information
=
tokenizer
.
encode_plus
(
seq_0
,
max_length
=
total_length
-
2
,
add_special_tokens
=
True
,
stride
=
stride
)
...
@@ -239,13 +240,13 @@ class CommonTestCases:
...
@@ -239,13 +240,13 @@ class CommonTestCases:
seq_1
=
"This is another sentence to be encoded."
seq_1
=
"This is another sentence to be encoded."
stride
=
2
stride
=
2
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
)
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
)
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)
sequence
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
sequence
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
truncated_second_sequence
=
tokenizer
.
build_inputs_with_special_tokens
(
truncated_second_sequence
=
tokenizer
.
build_inputs_with_special_tokens
(
tokenizer
.
encode
(
seq_0
),
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
),
tokenizer
.
encode
(
seq_1
)[:
-
2
]
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)[:
-
2
]
)
)
information
=
tokenizer
.
encode_plus
(
seq_0
,
seq_1
,
max_length
=
len
(
sequence
)
-
2
,
add_special_tokens
=
True
,
information
=
tokenizer
.
encode_plus
(
seq_0
,
seq_1
,
max_length
=
len
(
sequence
)
-
2
,
add_special_tokens
=
True
,
...
@@ -283,7 +284,7 @@ class CommonTestCases:
...
@@ -283,7 +284,7 @@ class CommonTestCases:
sequence_1
=
"This one too please."
sequence_1
=
"This one too please."
# Testing single inputs
# Testing single inputs
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
)
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
add_special_tokens
=
True
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
add_special_tokens
=
True
)
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
...
@@ -294,7 +295,8 @@ class CommonTestCases:
...
@@ -294,7 +295,8 @@ class CommonTestCases:
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
# Testing inputs pairs
# Testing inputs pairs
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
)
+
tokenizer
.
encode
(
sequence_1
)
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
+
tokenizer
.
encode
(
sequence_1
,
add_special_tokens
=
False
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
sequence_1
,
add_special_tokens
=
True
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
sequence_1
,
add_special_tokens
=
True
)
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
...
...
transformers/tests/tokenization_xlm_test.py
View file @
7d709e55
...
@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
def
test_sequence_builders
(
self
):
tokenizer
=
XLMTokenizer
.
from_pretrained
(
"xlm-mlm-en-2048"
)
tokenizer
=
XLMTokenizer
.
from_pretrained
(
"xlm-mlm-en-2048"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_xlnet_test.py
View file @
7d709e55
...
@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
def
test_sequence_builders
(
self
):
tokenizer
=
XLNetTokenizer
.
from_pretrained
(
"xlnet-base-cased"
)
tokenizer
=
XLNetTokenizer
.
from_pretrained
(
"xlnet-base-cased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tokenization_utils.py
View file @
7d709e55
...
@@ -689,14 +689,14 @@ class PreTrainedTokenizer(object):
...
@@ -689,14 +689,14 @@ class PreTrainedTokenizer(object):
raise
NotImplementedError
raise
NotImplementedError
def
encode
(
self
,
def
encode
(
self
,
text
,
text
,
text_pair
=
None
,
text_pair
=
None
,
add_special_tokens
=
False
,
add_special_tokens
=
True
,
max_length
=
None
,
max_length
=
None
,
stride
=
0
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
truncation_strategy
=
'longest_first'
,
return_tensors
=
None
,
return_tensors
=
None
,
**
kwargs
):
**
kwargs
):
"""
"""
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
...
@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
...
@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
def
encode_plus
(
self
,
def
encode_plus
(
self
,
text
,
text
,
text_pair
=
None
,
text_pair
=
None
,
add_special_tokens
=
False
,
add_special_tokens
=
True
,
max_length
=
None
,
max_length
=
None
,
stride
=
0
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
truncation_strategy
=
'longest_first'
,
...
@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
...
@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
truncation_strategy
=
truncation_strategy
,
truncation_strategy
=
truncation_strategy
,
return_tensors
=
return_tensors
)
return_tensors
=
return_tensors
)
def
prepare_for_model
(
self
,
ids
,
pair_ids
=
None
,
max_length
=
None
,
add_special_tokens
=
False
,
stride
=
0
,
def
prepare_for_model
(
self
,
ids
,
pair_ids
=
None
,
max_length
=
None
,
add_special_tokens
=
True
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
return_tensors
=
None
):
truncation_strategy
=
'longest_first'
,
return_tensors
=
None
):
"""
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment