chenpangpang / transformers / Commits

Commit 86a63070, authored Oct 21, 2019 by erenup
Merge branch 'huggingface/master'
Parents: b5d73976, 82f6abd9
Changes: 102

Showing 20 changed files with 1131 additions and 120 deletions (+1131 / -120)
transformers/tests/modeling_common_test.py          +40   -0
transformers/tests/modeling_ctrl_test.py            +215  -0
transformers/tests/modeling_tf_common_test.py       +95   -0
transformers/tests/modeling_tf_ctrl_test.py         +201  -0
transformers/tests/modeling_tf_gpt2_test.py         +1    -1
transformers/tests/modeling_tf_xlnet_test.py        +5    -0
transformers/tests/modeling_xlnet_test.py           +6    -0
transformers/tests/tokenization_bert_test.py        +2    -2
transformers/tests/tokenization_ctrl_test.py        +69   -0
transformers/tests/tokenization_distilbert_test.py  +2    -2
transformers/tests/tokenization_roberta_test.py     +2    -2
transformers/tests/tokenization_tests_commons.py    +55   -17
transformers/tests/tokenization_xlm_test.py         +2    -2
transformers/tests/tokenization_xlnet_test.py       +2    -2
transformers/tokenization_auto.py                   +6    -2
transformers/tokenization_bert.py                   +44   -12
transformers/tokenization_ctrl.py                   +187  -0
transformers/tokenization_gpt2.py                   +3    -0
transformers/tokenization_roberta.py                +42   -12
transformers/tokenization_utils.py                  +152  -66
transformers/tests/modeling_common_test.py

@@ -17,8 +17,10 @@ from __future__ import division
 from __future__ import print_function

 import copy
+import sys
 import os
 import shutil
+import tempfile
 import json
 import random
 import uuid

@@ -31,6 +33,7 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch
+    import numpy as np

     from transformers import (PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,

@@ -38,6 +41,20 @@ if is_torch_available():
 else:
     pytestmark = pytest.mark.skip("Require Torch")

+if sys.version_info[0] == 2:
+    import cPickle as pickle
+
+    class TemporaryDirectory(object):
+        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+        def __enter__(self):
+            self.name = tempfile.mkdtemp()
+            return self.name
+        def __exit__(self, exc_type, exc_value, traceback):
+            shutil.rmtree(self.name)
+else:
+    import pickle
+    TemporaryDirectory = tempfile.TemporaryDirectory
+    unicode = str
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)

@@ -57,6 +74,29 @@ class CommonTestCases:
         test_resize_embeddings = True
         test_head_masking = True

+        def test_save_load(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.eval()
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
+
+                with TemporaryDirectory() as tmpdirname:
+                    model.save_pretrained(tmpdirname)
+                    model = model_class.from_pretrained(tmpdirname)
+                    with torch.no_grad():
+                        after_outputs = model(**inputs_dict)
+
+                    # Make sure we don't have nans
+                    out_1 = after_outputs[0].numpy()
+                    out_2 = outputs[0].numpy()
+                    out_1 = out_1[~np.isnan(out_1)]
+                    out_2 = out_2[~np.isnan(out_2)]
+                    max_diff = np.amax(np.abs(out_1 - out_2))
+                    self.assertLessEqual(max_diff, 1e-5)
+
         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
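The NaN-masking comparison added in test_save_load above is worth calling out: both outputs are converted to NumPy, NaN entries are dropped, and only then is the maximum absolute difference bounded. A minimal standalone sketch of that check, using plain NumPy with made-up arrays standing in for the model outputs:

import numpy as np

def max_abs_diff_ignoring_nans(out_1, out_2):
    # Drop NaN entries before comparing, exactly as the test does, so a NaN
    # produced identically before and after save/load does not fail the check.
    out_1 = out_1[~np.isnan(out_1)]
    out_2 = out_2[~np.isnan(out_2)]
    return np.amax(np.abs(out_1 - out_2))

before = np.array([0.1, np.nan, 0.3])         # hypothetical outputs before save/load
after = np.array([0.1, np.nan, 0.3 + 1e-7])   # hypothetical outputs after reload
assert max_abs_diff_ignoring_nans(before, after) <= 1e-5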
transformers/tests/modeling_ctrl_test.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import pytest
import shutil
import pdb

from transformers import is_torch_available

if is_torch_available():
    from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
                              CTRLLMHeadModel)
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester


class CTRLModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    test_head_masking = False

    class CTRLModelTester(object):

        def __init__(self, parent,
                     batch_size=13, seq_length=7, is_training=True,
                     use_token_type_ids=True, use_input_mask=True,
                     use_labels=True, use_mc_token_ids=True,
                     vocab_size=99, hidden_size=32,
                     num_hidden_layers=5, num_attention_heads=4,
                     intermediate_size=37, hidden_act="gelu",
                     hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512, type_vocab_size=16,
                     type_sequence_label_size=2, initializer_range=0.02,
                     num_labels=3, num_choices=4, scope=None):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = CTRLConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
                    sequence_labels, token_labels, choice_labels)

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = CTRLModel(config=config)
            model.eval()

            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
            model(input_ids, token_type_ids=token_type_ids)
            sequence_output, presents = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "presents": presents,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = CTRLLMHeadModel(config)
            model.eval()

            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {
                "loss": loss,
                "lm_logits": lm_logits
            }
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'head_mask': head_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = CTRLModelTest.CTRLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_ctrl_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)

    def test_ctrl_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
transformers/tests/modeling_tf_common_test.py

@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function

+import os
 import copy
 import json
 import logging

@@ -22,6 +23,7 @@ import random
 import shutil
 import unittest
 import uuid
+import tempfile
 import pytest
 import sys

@@ -36,6 +38,20 @@ if is_tf_available():
 else:
     pytestmark = pytest.mark.skip("Require TensorFlow")

+if sys.version_info[0] == 2:
+    import cPickle as pickle
+
+    class TemporaryDirectory(object):
+        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+        def __enter__(self):
+            self.name = tempfile.mkdtemp()
+            return self.name
+        def __exit__(self, exc_type, exc_value, traceback):
+            shutil.rmtree(self.name)
+else:
+    import pickle
+    TemporaryDirectory = tempfile.TemporaryDirectory
+    unicode = str
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)

@@ -66,11 +82,31 @@ class TFCommonTestCases:
             #       self.assertIn(param.data.mean().item(), [0.0, 1.0],
             #                     msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))

+        def test_save_load(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                outputs = model(inputs_dict)
+
+                with TemporaryDirectory() as tmpdirname:
+                    model.save_pretrained(tmpdirname)
+                    model = model_class.from_pretrained(tmpdirname)
+                    after_outputs = model(inputs_dict)
+
+                    # Make sure we don't have nans
+                    out_1 = after_outputs[0].numpy()
+                    out_2 = outputs[0].numpy()
+                    out_1 = out_1[~np.isnan(out_1)]
+                    out_2 = out_2[~np.isnan(out_2)]
+                    max_diff = np.amax(np.abs(out_1 - out_2))
+                    self.assertLessEqual(max_diff, 1e-5)
+
         def test_pt_tf_model_equivalence(self):
             if not is_torch_available():
                 return

+            import torch
             import transformers

             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -79,12 +115,71 @@ class TFCommonTestCases:
                 pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
                 pt_model_class = getattr(transformers, pt_model_class_name)

+                config.output_hidden_states = True
                 tf_model = model_class(config)
                 pt_model = pt_model_class(config)

+                # Check we can load pt model in tf and vice-versa with model => model functions
+                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
+                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+                # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+                pt_model.eval()
+                pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long))
+                                      for name, key in inputs_dict.items())
+                with torch.no_grad():
+                    pto = pt_model(**pt_inputs_dict)
+                tfo = tf_model(inputs_dict)
+                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                self.assertLessEqual(max_diff, 2e-2)
+
+                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+                with TemporaryDirectory() as tmpdirname:
+                    pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin')
+                    torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                    tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                    tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5')
+                    tf_model.save_weights(tf_checkpoint_path)
+                    pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+                # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
+                pt_model.eval()
+                pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long))
+                                      for name, key in inputs_dict.items())
+                with torch.no_grad():
+                    pto = pt_model(**pt_inputs_dict)
+                tfo = tf_model(inputs_dict)
+                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                self.assertLessEqual(max_diff, 2e-2)
+
+        def test_compile_tf_model(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+
+            for model_class in self.all_model_classes:
+                # Prepare our model
+                model = model_class(config)
+
+                # Let's load it from the disk to be sure we can use pretrained weights
+                with TemporaryDirectory() as tmpdirname:
+                    outputs = model(inputs_dict)  # build the model
+                    model.save_pretrained(tmpdirname)
+                    model = model_class.from_pretrained(tmpdirname)
+
+                outputs_dict = model(input_ids)
+                hidden_states = outputs_dict[0]
+
+                # Add a dense layer on top to test intetgration with other keras modules
+                outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(hidden_states)
+
+                # Compile extended model
+                extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
+                extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+
         def test_keyword_and_dict_args(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
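The new test_pt_tf_model_equivalence above round-trips randomly initialised weights between a PyTorch model and its TF 2.0 counterpart, then compares the first output of both. A rough usage sketch outside the test harness, assuming both torch and TF 2.0 are installed; the conversion function names come from the diff above, while the choice of BERT classes and the toy config/inputs are my own stand-ins:

import numpy as np
import tensorflow as tf
import torch
import transformers
from transformers import BertConfig, BertModel, TFBertModel

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37)
tf_model = TFBertModel(config)
pt_model = BertModel(config)

# Toy batch of token ids, shaped [batch_size, seq_length]
inputs_dict = {'input_ids': tf.constant([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], dtype=tf.int32)}

# Copy the PyTorch weights into the TF model, then back again
tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

# Compare the first output of both models, as the test does
pt_model.eval()
pt_inputs_dict = {name: torch.from_numpy(value.numpy()).to(torch.long)
                  for name, value in inputs_dict.items()}
with torch.no_grad():
    pto = pt_model(**pt_inputs_dict)
tfo = tf_model(inputs_dict)
print(np.amax(np.abs(tfo[0].numpy() - pto[0].numpy())))  # expected to be very small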
transformers/tests/modeling_tf_ctrl_test.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester

from transformers import CTRLConfig, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
                                               TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()

    class TFCTRLModelTester(object):

        def __init__(self, parent,
                     batch_size=13, seq_length=7, is_training=True,
                     use_token_type_ids=True, use_input_mask=True,
                     use_labels=True, use_mc_token_ids=True,
                     vocab_size=99, hidden_size=32,
                     num_hidden_layers=5, num_attention_heads=4,
                     intermediate_size=37, hidden_act="gelu",
                     hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512, type_vocab_size=16,
                     type_sequence_label_size=2, initializer_range=0.02,
                     num_labels=3, num_choices=4, scope=None):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = CTRLConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
                    sequence_labels, token_labels, choice_labels)

        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLModel(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLLMHeadModel(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFCTRLModelTest.TFCTRLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_ctrl_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_ctrl_model(*config_and_inputs)

    def test_ctrl_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFCTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
transformers/tests/modeling_tf_gpt2_test.py

@@ -222,7 +222,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
-        for model_name in list(TF_gpt2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
transformers/tests/modeling_tf_xlnet_test.py

@@ -161,6 +161,11 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 "outputs": outputs.numpy(),
             }

+            config.mem_len = 0
+            model = TFXLNetModel(config)
+            no_mems_outputs = model(inputs)
+            self.parent.assertEqual(len(no_mems_outputs), 1)
+
             self.parent.assertListEqual(
                 list(result["outputs"].shape),
                 [self.batch_size, self.seq_length, self.hidden_size])
transformers/tests/modeling_xlnet_test.py

@@ -150,6 +150,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 "outputs": outputs,
             }

+            config.mem_len = 0
+            model = XLNetModel(config)
+            model.eval()
+            no_mems_outputs = model(input_ids_1)
+            self.parent.assertEqual(len(no_mems_outputs), 1)
+
             self.parent.assertListEqual(
                 list(result["outputs"].size()),
                 [self.batch_size, self.seq_length, self.hidden_size])
transformers/tests/tokenization_bert_test.py

@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")

-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
transformers/tests/tokenization_ctrl_test.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
import json
from io import open

from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = CTRLTokenizer

    def setUp(self):
        super(CTRLTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', '<unk>']
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", 'a p', 'ap t</w>', 'r e', 'a d', 'ad apt</w>', '']
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"adapt react readapt apt"
        output_text = u"adapt react readapt apt"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "adapt react readapt apt"
        bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]

        input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)


if __name__ == '__main__':
    unittest.main()
transformers/tests/tokenization_distilbert_test.py

@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")

-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
transformers/tests/tokenization_roberta_test.py

@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

         assert encoded_sentence == encoded_text_from_decode
         assert encoded_pair == encoded_pair_from_decode
transformers/tests/tokenization_tests_commons.py

@@ -193,12 +193,12 @@ class CommonTestCases:
             tokenizer = self.get_tokenizer()

-            if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+            if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
                 information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                 sequences, mask = information["input_ids"], information["token_type_ids"]
-                assert len(sequences) == len(mask)
+                self.assertEqual(len(sequences), len(mask))

         def test_number_of_added_tokens(self):
             tokenizer = self.get_tokenizer()

@@ -211,7 +211,7 @@ class CommonTestCases:
             # Method is implemented (e.g. not GPT-2)
             if len(attached_sequences) != 2:
-                assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
+                self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

         def test_maximum_encoding_length_single_input(self):
             tokenizer = self.get_tokenizer()

@@ -227,10 +227,10 @@ class CommonTestCases:
             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]

-            assert len(overflowing_tokens) == 2 + stride
-            assert overflowing_tokens == sequence[-(2 + stride):]
-            assert len(truncated_sequence) == total_length - 2
-            assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
+            self.assertEqual(len(overflowing_tokens), 2 + stride)
+            self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
+            self.assertEqual(len(truncated_sequence), total_length - 2)
+            self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

         def test_maximum_encoding_length_pair_input(self):
             tokenizer = self.get_tokenizer()

@@ -243,26 +243,26 @@ class CommonTestCases:
             sequence_1_no_special_tokens = tokenizer.encode(seq_1)

             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-            truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
+            truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
                 tokenizer.encode(seq_0),
                 tokenizer.encode(seq_1)[:-2]
             )

             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride, truncate_first_sequence=False)
+                                                stride=stride, truncation_strategy='only_second')
             information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                 add_special_tokens=True, stride=stride,
-                                                                truncate_first_sequence=True)
+                                                                truncation_strategy='only_first')

             truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
             overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

-            assert len(overflowing_tokens) == 2 + stride
-            assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
-            assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
-            assert len(truncated_sequence) == len(sequence) - 2
-            assert truncated_sequence == truncated_second_sequence
+            self.assertEqual(len(overflowing_tokens), 2 + stride)
+            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
+            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
+            self.assertEqual(len(truncated_sequence), len(sequence) - 2)
+            self.assertEqual(truncated_sequence, truncated_second_sequence)

         def test_encode_input_type(self):
             tokenizer = self.get_tokenizer()

@@ -273,5 +273,43 @@ class CommonTestCases:
             input_ids = tokenizer.convert_tokens_to_ids(tokens)
             formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

-            assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
-            assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
+            self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
+            self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
+
+        def test_special_tokens_mask(self):
+            tokenizer = self.get_tokenizer()
+
+            sequence_0 = "Encode this."
+            sequence_1 = "This one too please."
+
+            # Testing single inputs
+            encoded_sequence = tokenizer.encode(sequence_0)
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+
+            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [x for x in filtered_sequence if x is not None]
+            self.assertEqual(encoded_sequence, filtered_sequence)
+
+            # Testing inputs pairs
+            encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+
+            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
+            filtered_sequence = [x for x in filtered_sequence if x is not None]
+            self.assertEqual(encoded_sequence, filtered_sequence)
+
+            # Testing with already existing special tokens
+            if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
+                tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
+            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
+            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
+            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+            self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
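The new test_special_tokens_mask above pins down the contract of the mask returned by encode_plus: position i is 1 where input_ids[i] is a special token and 0 where it is a regular sequence token, so filtering on the mask recovers the encoding without special tokens. A self-contained illustration with made-up ids (101/102 stand in for [CLS]/[SEP]; no tokenizer required):

# Hypothetical output of encode_plus(sequence, add_special_tokens=True)
input_ids = [101, 7, 8, 9, 102]          # [CLS] w1 w2 w3 [SEP]
special_tokens_mask = [1, 0, 0, 0, 1]    # 1 marks a special-token position

# Same filtering the common test performs
filtered = [x for i, x in enumerate(input_ids) if not special_tokens_mask[i]]
assert filtered == [7, 8, 9]             # equals encode(sequence) without special tokens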
transformers/tests/tokenization_xlm_test.py

@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")

-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

         assert encoded_sentence == [1] + text + [1]
         assert encoded_pair == [1] + text + [1] + text_2 + [1]
transformers/tests/tokenization_xlnet_test.py

@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")

-        encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
-        encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

         assert encoded_sentence == text + [4, 3]
         assert encoded_pair == text + [4] + text_2 + [4, 3]
transformers/tokenization_auto.py

@@ -21,6 +21,7 @@ import logging
 from .tokenization_bert import BertTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_ctrl import CTRLTokenizer
 from .tokenization_transfo_xl import TransfoXLTokenizer
 from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer

@@ -45,6 +46,7 @@ class AutoTokenizer(object):
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
             - contains `xlnet`: XLNetTokenizer (XLNet model)
             - contains `xlm`: XLMTokenizer (XLM model)

@@ -67,6 +69,7 @@ class AutoTokenizer(object):
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
             - contains `xlnet`: XLNetTokenizer (XLNet model)
             - contains `xlm`: XLMTokenizer (XLM model)

@@ -114,7 +117,8 @@ class AutoTokenizer(object):
             return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
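With the new branch above, AutoTokenizer resolves any model identifier containing 'ctrl' to CTRLTokenizer. A quick usage sketch (it downloads the CTRL vocab and merges files on first call, so it needs network access; the printed output is only what I would expect, not verified here):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ctrl')   # identifier contains 'ctrl' -> CTRLTokenizer
print(type(tokenizer).__name__)                     # expected: CTRLTokenizer
print(tokenizer.tokenize("Links Hello world"))      # BPE pieces, with '@@' marking word-internal splits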
transformers/tokenization_bert.py

@@ -44,6 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
         'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
+        'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
+        'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
     }
 }

@@ -61,6 +63,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
     'bert-large-cased-whole-word-masking-finetuned-squad': 512,
     'bert-base-cased-finetuned-mrpc': 512,
+    'bert-base-german-dbmdz-cased': 512,
+    'bert-base-german-dbmdz-uncased': 512,
 }

 PRETRAINED_INIT_CONFIGURATION = {

@@ -77,6 +81,8 @@ PRETRAINED_INIT_CONFIGURATION = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
     'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
     'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
+    'bert-base-german-dbmdz-cased': {'do_lower_case': False},
+    'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
 }

@@ -187,33 +193,59 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = ' '.join(tokens).replace(' ##', '').strip()
         return out_string

-    def add_special_tokens_single_sequence(self, token_ids):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Adds special tokens to the a sequence for sequence classification tasks.
-        A BERT sequence has the following format: [CLS] X [SEP]
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
         """
-        return [self.cls_token_id] + token_ids + [self.sep_token_id]
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep

-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
-        Adds special tokens to a sequence pair for sequence classification tasks.
-        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
         | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, vocab_path):
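To make the reworked BERT helpers concrete, here is a worked toy example that mirrors the logic shown in the diff (101 and 102 are BERT's [CLS]/[SEP] ids, as the updated bert tokenization test also asserts; the functions below are standalone mirrors of the methods for illustration, not the library itself):

CLS, SEP = 101, 102

def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None):
    # Mirrors BertTokenizer.build_inputs_with_special_tokens above
    if token_ids_1 is None:
        return [CLS] + token_ids_0 + [SEP]
    return [CLS] + token_ids_0 + [SEP] + token_ids_1 + [SEP]

def create_token_type_ids_from_sequences(token_ids_0, token_ids_1=None):
    # Mirrors BertTokenizer.create_token_type_ids_from_sequences above
    if token_ids_1 is None:
        return (len(token_ids_0) + 2) * [0]
    return (len(token_ids_0) + 2) * [0] + (len(token_ids_1) + 1) * [1]

a, b = [7, 8, 9], [11, 12]
print(build_inputs_with_special_tokens(a, b))        # [101, 7, 8, 9, 102, 11, 12, 102]
print(create_token_type_ids_from_sequences(a, b))    # [0, 0, 0, 0, 0, 1, 1, 1]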
transformers/tokenization_ctrl.py
0 → 100644
View file @
86a63070
# coding=utf-8
# Copyright 2018 Salesforce and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Salesforce CTRL."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import json
import logging
import os
import regex as re
from io import open

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    'vocab_file': 'vocab.json',
    'merges_file': 'merges.txt',
}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",
    },
    'merges_file':
    {
        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'ctrl': 256,
}

def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char

    pairs = set(pairs)
    return pairs

class CTRLTokenizer(PreTrainedTokenizer):
    """
    CTRL BPE tokenizer. Peculiarities:
        - Byte-level Byte-Pair-Encoding
        - Requires a space to start the input string => the encoding methods should be called with the
          ``add_prefix_space`` flag set to ``True``.
          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        word = tuple(list(word[:-1]) + [word[-1] + '</w>'])
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = '@@ '.join(word)
        word = word[:-4]
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string.
        """
        split_tokens = []

        text = text.split(' ')

        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = ' '.join(tokens).replace('@@ ', '').strip()
        return out_string

    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

        return vocab_file, merge_file

    # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    #     filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
    #     tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
    #     tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
    #     return ''.join(tokens_generated_so_far)
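For orientation (not part of the commit), a minimal round-trip with the new tokenizer; it assumes CTRLTokenizer is exported at the package top level in this commit and that the 'ctrl' vocabulary/merges files above can be downloaded or are cached:

# Sketch only: CTRL BPE marks non-final word pieces with '@@ ',
# and convert_tokens_to_string() strips those markers again.
from transformers import CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained('ctrl')
tokens = tokenizer.tokenize("Links Hello world")
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.convert_tokens_to_string(tokens)

print(tokens)
print(ids)
print(text)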
transformers/tokenization_gpt2.py
View file @
86a63070
...
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
    },
    'merges_file':
    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
    },
}
...
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'gpt2': 1024,
    'gpt2-medium': 1024,
    'gpt2-large': 1024,
    'distilgpt2': 1024,
}

@lru_cache()
...
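A quick sketch (not part of the diff) of what the new shortcut buys, assuming the files above can be downloaded:

# Sketch only: 'distilgpt2' now resolves vocab/merges through the maps above.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
print(tokenizer.max_len)  # 1024, taken from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES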
transformers/tokenization_roberta.py
View file @
86a63070
...
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
    },
    'merges_file':
    {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
    },
}
...
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'roberta-base': 512,
    'roberta-large': 512,
    'roberta-large-mnli': 512,
    'distilroberta-base': 512,
}
...
@@ -84,30 +87,57 @@ class RobertaTokenizer(GPT2Tokenizer):
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formated with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A RoBERTa sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
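A minimal sketch (not part of the diff) of the RoBERTa pair layout, assuming the 'roberta-base' files are available:

# Sketch only: RoBERTa pairs are built as <s> A </s></s> B </s>.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
ids_a = tokenizer.encode("Hello world", add_special_tokens=False)
ids_b = tokenizer.encode("How are you?", add_special_tokens=False)

pair_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
mask = tokenizer.get_special_tokens_mask(ids_a, ids_b)

print(pair_ids)  # note the doubled </s> between the two segments
print(mask)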
transformers/tokenization_utils.py
View file @
86a63070
...
@@ -337,13 +337,13 @@ class PreTrainedTokenizer(object):
                vocab_files[file_id] = full_file_name

            if all(full_file_name is None for full_file_name in vocab_files.values()):
                raise EnvironmentError(
                    "Model name '{}' was not found in tokenizers model name list ({}). "
                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
                    "named {} but couldn't find such vocabulary files at this path or url.".format(
                        pretrained_model_name_or_path, ', '.join(s3_models),
                        pretrained_model_name_or_path,
                        list(cls.vocab_files_names.values())))

        # Get files from url, cache, or disk depending on the case
        try:
...
@@ -353,17 +353,18 @@ class PreTrainedTokenizer(object):
                    resolved_vocab_files[file_id] = None
                else:
                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
        except EnvironmentError:
            if pretrained_model_name_or_path in s3_models:
                msg = "Couldn't reach server at '{}' to download vocabulary files."
            else:
                msg = "Model name '{}' was not found in tokenizers model name list ({}). " \
                      "We assumed '{}' was a path or url to a directory containing vocabulary files " \
                      "named {}, but couldn't find such vocabulary files at this path or url.".format(
                          pretrained_model_name_or_path, ', '.join(s3_models),
                          pretrained_model_name_or_path,
                          list(cls.vocab_files_names.values()))
            raise EnvironmentError(msg)

        for file_id, file_path in vocab_files.items():
            if file_path == resolved_vocab_files[file_id]:
...
@@ -512,7 +513,8 @@ class PreTrainedTokenizer(object):
        for token in new_tokens:
            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
            if token != self.unk_token and \
                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
                    token not in to_add_tokens:
                to_add_tokens.append(token)
                logger.info("Adding %s to the vocabulary", token)
...
@@ -538,15 +540,9 @@ class PreTrainedTokenizer(object):
        Returns:
            Number of tokens added to sequences
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def add_special_tokens(self, special_tokens_dict):
        """
...
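A short sketch (not part of the diff) of the simplified counting above, assuming 'bert-base-uncased' is available:

# Sketch only: num_added_tokens() now measures build_inputs_with_special_tokens() on empty inputs.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.num_added_tokens())           # 2 for BERT: [CLS] X [SEP]
print(tokenizer.num_added_tokens(pair=True))  # 3 for BERT: [CLS] A [SEP] B [SEP]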
@@ -698,7 +694,7 @@ class PreTrainedTokenizer(object):
               add_special_tokens=False,
               max_length=None,
               stride=0,
               truncation_strategy='longest_first',
               return_tensors=None,
               **kwargs):
        """
...
@@ -718,9 +714,13 @@ class PreTrainedTokenizer(object):
            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                If there are overflowing tokens, those will be added to the returned dictionary
            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                from the main sequence returned. The value of this argument defines the number of additional tokens.
            truncation_strategy: string selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                    starting from the longest one at each token (when there is a pair of input sequences)
                - 'only_first': Only truncate the first sequence
                - 'only_second': Only truncate the second sequence
                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.
            **kwargs: passed to the `self.tokenize()` method
...
@@ -730,7 +730,7 @@ class PreTrainedTokenizer(object):
                                  max_length=max_length,
                                  add_special_tokens=add_special_tokens,
                                  stride=stride,
                                  truncation_strategy=truncation_strategy,
                                  return_tensors=return_tensors,
                                  **kwargs)
...
@@ -742,7 +742,7 @@ class PreTrainedTokenizer(object):
                    add_special_tokens=False,
                    max_length=None,
                    stride=0,
                    truncation_strategy='longest_first',
                    return_tensors=None,
                    **kwargs):
        """
...
@@ -761,9 +761,13 @@ class PreTrainedTokenizer(object):
            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                If there are overflowing tokens, those will be added to the returned dictionary
            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                from the main sequence returned. The value of this argument defines the number of additional tokens.
            truncation_strategy: string selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                    starting from the longest one at each token (when there is a pair of input sequences)
                - 'only_first': Only truncate the first sequence
                - 'only_second': Only truncate the second sequence
                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.
            **kwargs: passed to the `self.tokenize()` method
...
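A minimal sketch (not part of the diff) of the new truncation controls on encode_plus, assuming 'bert-base-uncased' is available:

# Sketch only: truncation_strategy and stride replace the old truncate_first_sequence flag.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
outputs = tokenizer.encode_plus("This is a rather long premise sentence for the example.",
                                "And this is the hypothesis.",
                                add_special_tokens=True,
                                max_length=12,
                                stride=2,
                                truncation_strategy='longest_first')

print(outputs["input_ids"])             # capped at max_length
print(outputs["num_truncated_tokens"])  # how many ids were dropped in total
print(outputs["overflowing_tokens"])    # overflow from the first sequence, plus a stride-sized window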
@@ -787,12 +791,11 @@ class PreTrainedTokenizer(object):
                                      max_length=max_length,
                                      add_special_tokens=add_special_tokens,
                                      stride=stride,
                                      truncation_strategy=truncation_strategy,
                                      return_tensors=return_tensors)

    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
                          truncation_strategy='longest_first', return_tensors=None):
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
        It adds special tokens, truncates
...
@@ -809,41 +812,50 @@ class PreTrainedTokenizer(object):
                to their model.
            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                list of inputs.
            truncation_strategy: string selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                    starting from the longest one at each token (when there is a pair of input sequences)
                - 'only_first': Only truncate the first sequence
                - 'only_second': Only truncate the second sequence
                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.

        Return:
            A Dictionary of shape::

                {
                    input_ids: list[int],
                    overflowing_tokens: list[int] if a ``max_length`` is specified, else None
                    special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
                }

            With the fields:
                ``input_ids``: list of tokens to be fed to a model
                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
                tokens and 1 specifying sequence tokens.
        """
        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0

        encoded_inputs = {}
        total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
        if max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
                                                                        num_tokens_to_remove=total_len - max_length,
                                                                        truncation_strategy=truncation_strategy,
                                                                        stride=stride)
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
...
@@ -860,20 +872,89 @@ class PreTrainedTokenizer(object):
        encoded_inputs["input_ids"] = sequence
        encoded_inputs["token_type_ids"] = token_type_ids

        if max_length and len(encoded_inputs["input_ids"]) > max_length:
            encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
            encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
            encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

        return encoded_inputs

    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
        """Truncates a sequence pair in place to the maximum length.
            truncation_strategy: string selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
                    starting from the longest one at each token (when there is a pair of input sequences).
                    Overflowing tokens only contains overflow from the first sequence.
                - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
                - 'only_second': Only truncate the second sequence
                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []

        if truncation_strategy == 'longest_first':
            overflowing_tokens = []
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    overflowing_tokens = [ids[-1]] + overflowing_tokens
                    ids = ids[:-1]
                else:
                    pair_ids = pair_ids[:-1]
            window_len = min(len(ids), stride)
            if window_len > 0:
                overflowing_tokens = ids[-window_len:] + overflowing_tokens
        elif truncation_strategy == 'only_first':
            assert len(ids) > num_tokens_to_remove
            window_len = min(len(ids), stride + num_tokens_to_remove)
            overflowing_tokens = ids[-window_len:]
            ids = ids[:-num_tokens_to_remove]
        elif truncation_strategy == 'only_second':
            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
            overflowing_tokens = pair_ids[-window_len:]
            pair_ids = pair_ids[:-num_tokens_to_remove]
        elif truncation_strategy == 'do_not_truncate':
            raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
        else:
            raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
        return (ids, pair_ids, overflowing_tokens)

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        logger.warning("This tokenizer does not make use of special tokens.")
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
        logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) in a token "
            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
...
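Since truncate_sequences operates on plain id lists, it can be sketched without model-specific inputs (again with 'bert-base-uncased' assumed available, not part of the diff):

# Sketch only: 'longest_first' removes ids from whichever sequence is currently longer.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = list(range(10))      # toy "first sequence"
pair_ids = list(range(4))  # toy "second sequence"

ids, pair_ids, overflow = tokenizer.truncate_sequences(ids, pair_ids=pair_ids,
                                                       num_tokens_to_remove=3,
                                                       truncation_strategy='longest_first',
                                                       stride=1)

print(len(ids), len(pair_ids))  # 7 4 -- all three removals came out of the longer first sequence
print(overflow)                 # the removed ids plus a one-id stride window: [6, 7, 8, 9]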
@@ -911,6 +992,11 @@ class PreTrainedTokenizer(object):
        Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
        with options to remove special tokens and clean up tokenization spaces.
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
            skip_special_tokens: if set to True, will replace special tokens.
            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
...
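Finally, a tiny sketch (not part of the diff) of the documented decode() arguments, assuming 'bert-base-uncased' is available:

# Sketch only: decode() reverses encode(), optionally dropping the special tokens.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tokenizer.encode("Hello world", add_special_tokens=True)
print(tokenizer.decode(ids, skip_special_tokens=False))  # includes [CLS] ... [SEP]
print(tokenizer.decode(ids, skip_special_tokens=True))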