transformers · Commit 5daca95d (unverified)
Authored Dec 22, 2019 by Thomas Wolf; committed by GitHub on Dec 22, 2019

Merge pull request #2268 from aaugustin/improve-repository-structure

Improve repository structure

Parents: 54abc67a · 00204f2b
Changes: 167

Showing 7 changed files with 4 additions and 1829 deletions (+4 / -1829):

    tests/test_tokenization_utils.py                    +0   -4
    tests/test_tokenization_xlm.py                      +2   -6
    tests/test_tokenization_xlnet.py                    +2   -6
    tests/utils.py                                      +0   -0
    transformers/tests/modeling_common_test.py          +0   -898
    transformers/tests/modeling_tf_common_test.py       +0   -384
    transformers/tests/tokenization_tests_commons.py    +0   -531
transformers/tests/tokenization_utils_test.py → tests/test_tokenization_utils.py
```diff
@@ -44,7 +44,3 @@ class TokenizerUtilsTest(unittest.TestCase):
     @slow
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
-
-
-if __name__ == "__main__":
-    unittest.main()
```
transformers/tests/tokenization_xlm_test.py → tests/test_tokenization_xlm.py
```diff
@@ -20,11 +20,11 @@ import unittest
 
 from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
 
-from .tokenization_tests_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
 
 
-class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = XLMTokenizer
@@ -98,7 +98,3 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [1] + text + [1]
         assert encoded_pair == [1] + text + [1] + text_2 + [1]
-
-
-if __name__ == "__main__":
-    unittest.main()
```
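Both tokenizer diffs in this commit make the same move. The old nested `CommonTestCases.CommonTokenizerTester` pattern existed so that the shared test methods would not be collected and run by the unittest runner on their own; a mixin that does not itself inherit from `unittest.TestCase` achieves the same thing more directly. A self-contained sketch of the pattern (the names here are illustrative, not taken from the diff):

```python
import unittest


class TesterMixin:
    # Plain object, not a TestCase: the runner never collects this class,
    # so the shared test below only runs through concrete subclasses.
    def test_roundtrip(self):
        self.assertEqual(self.value, self.value)


class ConcreteTest(TesterMixin, unittest.TestCase):
    value = 42  # each concrete suite supplies its own fixtures


if __name__ == "__main__":
    unittest.main()  # collects and runs ConcreteTest.test_roundtrip exactly once
```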
transformers/tests/tokenization_xlnet_test.py → tests/test_tokenization_xlnet.py
```diff
@@ -19,14 +19,14 @@ import unittest
 
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
 
-from .tokenization_tests_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
 
 
-class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = XLNetTokenizer
@@ -183,7 +183,3 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == text + [4, 3]
         assert encoded_pair == text + [4] + text_2 + [4, 3]
-
-
-if __name__ == "__main__":
-    unittest.main()
```
transformers/tests/utils.py → tests/utils.py  (file moved, no content changes)
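tests/utils.py moves without content changes; it is where the files below import `CACHE_DIR`, `require_torch`, `require_tf`, `slow`, and `torch_device` from. For orientation, a hedged sketch of what a `slow`-style decorator typically looks like (this is the standard unittest skip pattern; the module's actual contents are not shown in this diff and may differ):

```python
import os
import unittest


def slow(test_case):
    """Skip a test unless a RUN_SLOW flag is set in the environment.

    Assumption: slow tests are gated on an environment variable; the exact
    flag name and parsing in tests/utils.py may differ from this sketch.
    """
    if not os.environ.get("RUN_SLOW"):
        return unittest.skip("test is slow; set RUN_SLOW=1 to run it")(test_case)
    return test_case
```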
transformers/tests/modeling_common_test.py  (deleted, mode 100644 → 0)
```python
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, print_function

import copy
import json
import logging
import os.path
import random
import shutil
import sys
import tempfile
import unittest
import uuid

from transformers import is_torch_available

from .utils import CACHE_DIR, require_torch, slow, torch_device

if is_torch_available():
    import torch
    import numpy as np

    from transformers import (
        AdaptiveEmbedding,
        PretrainedConfig,
        PreTrainedModel,
        BertModel,
        BertConfig,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )

if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str


def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
        if "_range" in key or "_std" in key or "initializer_factor" in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init


class CommonTestCases:
    @require_torch
    class CommonModelTester(unittest.TestCase):

        model_tester = None
        all_model_classes = ()
        test_torchscript = True
        test_pruning = True
        test_resize_embeddings = True
        test_head_masking = True
        is_encoder_decoder = False

        def test_save_load(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
                out_2 = outputs[0].numpy()
                out_2[np.isnan(out_2)] = 0

                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)
                    model.to(torch_device)
                    with torch.no_grad():
                        after_outputs = model(**inputs_dict)

                    # Make sure we don't have nans
                    out_1 = after_outputs[0].cpu().numpy()
                    out_1[np.isnan(out_1)] = 0
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)

        def test_initialization(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            configs_no_init = _config_zero_init(config)
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        self.assertIn(
                            param.data.mean().item(),
                            [0.0, 1.0],
                            msg="Parameter {} of model {} seems not properly initialized".format(name, model_class),
                        )

        def test_determinism(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    first = model(**inputs_dict)[0]
                    second = model(**inputs_dict)[0]
                out_1 = first.cpu().numpy()
                out_2 = second.cpu().numpy()
                out_1 = out_1[~np.isnan(out_1)]
                out_2 = out_2[~np.isnan(out_2)]
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            decoder_seq_length = (
                self.model_tester.decoder_seq_length
                if hasattr(self.model_tester, "decoder_seq_length")
                else self.model_tester.seq_length
            )
            encoder_seq_length = (
                self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "encoder_seq_length")
                else self.model_tester.seq_length
            )
            decoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
            )
            encoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
            )

            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
                out_len = len(outputs)

                if self.is_encoder_decoder:
                    self.assertEqual(out_len % 2, 0)
                    decoder_attentions = outputs[(out_len // 2) - 1]
                    self.assertEqual(model.config.output_attentions, True)
                    self.assertEqual(model.config.output_hidden_states, False)
                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                    self.assertListEqual(
                        list(decoder_attentions[0].shape[-3:]),
                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                    )

                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)

                self_attentions = outputs[-1]
                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            self._create_and_check_torchscript(config, inputs_dict)

        def test_torchscript_output_attentions(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.output_attentions = True
            self._create_and_check_torchscript(config, inputs_dict)

        def test_torchscript_output_hidden_state(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.output_hidden_states = True
            self._create_and_check_torchscript(config, inputs_dict)

        def _create_and_check_torchscript(self, config, inputs_dict):
            if not self.test_torchscript:
                return

            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            configs_no_init.torchscript = True
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.to(torch_device)
                model.eval()
                inputs = inputs_dict["input_ids"]  # Let's keep only input_ids

                try:
                    traced_gpt2 = torch.jit.trace(model, inputs)
                except RuntimeError:
                    self.fail("Couldn't trace module.")

                with TemporaryDirectory() as tmp_dir_name:
                    pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")

                    try:
                        torch.jit.save(traced_gpt2, pt_file_name)
                    except Exception:
                        self.fail("Couldn't save module.")

                    try:
                        loaded_model = torch.jit.load(pt_file_name)
                    except Exception:
                        self.fail("Couldn't load module.")

                model.to(torch_device)
                model.eval()

                loaded_model.to(torch_device)
                loaded_model.eval()

                model_params = model.parameters()
                loaded_model_params = loaded_model.parameters()

                models_equal = True
                for p1, p2 in zip(model_params, loaded_model_params):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False

                self.assertTrue(models_equal)

        def test_headmasking(self):
            if not self.test_head_masking:
                return

            global_rng.seed(42)
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            global_rng.seed()

            config.output_attentions = True
            config.output_hidden_states = True
            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.to(torch_device)
                model.eval()

                # Prepare head_mask
                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
                head_mask = torch.ones(
                    self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device
                )
                head_mask[0, 0] = 0
                head_mask[-1, :-1] = 0
                head_mask.requires_grad_(requires_grad=True)
                inputs = inputs_dict.copy()
                inputs["head_mask"] = head_mask

                outputs = model(**inputs)

                # Test that we can get a gradient back for importance score computation
                output = sum(t.sum() for t in outputs[0])
                output = output.sum()
                output.backward()
                multihead_outputs = head_mask.grad

                attentions = outputs[-1]
                hidden_states = outputs[-2]

                # Remove Nan
                for t in attentions:
                    # Check we don't have more than 25% nans (arbitrary)
                    self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4)
                # remove them (the test is less complete)
                attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions]

                self.assertIsNotNone(multihead_outputs)
                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

        def test_head_pruning(self):
            if not self.test_pruning:
                return

            for model_class in self.all_model_classes:
                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

                if "head_mask" in inputs_dict:
                    del inputs_dict["head_mask"]

                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config=config)
                model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                model.prune_heads(heads_to_prune)
                with torch.no_grad():
                    outputs = model(**inputs_dict)

                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

        def test_head_pruning_save_load_from_pretrained(self):
            if not self.test_pruning:
                return

            for model_class in self.all_model_classes:
                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

                if "head_mask" in inputs_dict:
                    del inputs_dict["head_mask"]

                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config=config)
                model.to(torch_device)
                model.eval()
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                model.prune_heads(heads_to_prune)

                with TemporaryDirectory() as temp_dir_name:
                    model.save_pretrained(temp_dir_name)
                    model = model_class.from_pretrained(temp_dir_name)
                    model.to(torch_device)

                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

        def test_head_pruning_save_load_from_config_init(self):
            if not self.test_pruning:
                return

            for model_class in self.all_model_classes:
                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

                if "head_mask" in inputs_dict:
                    del inputs_dict["head_mask"]

                config.output_attentions = True
                config.output_hidden_states = False

                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
                config.pruned_heads = heads_to_prune

                model = model_class(config=config)
                model.to(torch_device)
                model.eval()

                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

        def test_head_pruning_integration(self):
            if not self.test_pruning:
                return

            for model_class in self.all_model_classes:
                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

                if "head_mask" in inputs_dict:
                    del inputs_dict["head_mask"]

                config.output_attentions = True
                config.output_hidden_states = False

                heads_to_prune = {0: [0], 1: [1, 2]}
                config.pruned_heads = heads_to_prune

                model = model_class(config=config)
                model.to(torch_device)
                model.eval()

                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

                with TemporaryDirectory() as temp_dir_name:
                    model.save_pretrained(temp_dir_name)
                    model = model_class.from_pretrained(temp_dir_name)
                    model.to(torch_device)

                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

                heads_to_prune = {0: [0], 2: [1, 2]}
                model.prune_heads(heads_to_prune)

                with torch.no_grad():
                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

                self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})

        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                config.output_hidden_states = True
                config.output_attentions = False
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
                    [
                        self.model_tester.encoder_seq_length
                        if hasattr(self.model_tester, "encoder_seq_length")
                        else self.model_tester.seq_length,
                        self.model_tester.hidden_size,
                    ],
                )

        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            if not self.test_resize_embeddings:
                return

            for model_class in self.all_model_classes:
                config = copy.deepcopy(original_config)
                model = model_class(config)

                model_vocab_size = config.vocab_size
                # Retrieve the embeddings and clone them
                model_embed = model.resize_token_embeddings(model_vocab_size)
                cloned_embeddings = model_embed.weight.clone()

                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size + 10)
                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
                model_embed = model.resize_token_embeddings(model_vocab_size - 15)
                self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
                # Check that it actually resizes the embeddings matrix
                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
                models_equal = True
                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
                    if p1.data.ne(p2.data).sum() > 0:
                        models_equal = False

                self.assertTrue(models_equal)

        def test_model_common_attributes(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
                model.set_input_embeddings(torch.nn.Embedding(10, 10))
                x = model.get_output_embeddings()
                self.assertTrue(x is None or isinstance(x, torch.nn.Linear))

        def test_tie_model_weights(self):
            if not self.test_torchscript:
                return

            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            def check_same_values(layer_1, layer_2):
                equal = True
                for p1, p2 in zip(layer_1.weight, layer_2.weight):
                    if p1.data.ne(p2.data).sum() > 0:
                        equal = False
                return equal

            for model_class in self.all_model_classes:
                config.torchscript = True
                model_not_tied = model_class(config)
                if model_not_tied.get_output_embeddings() is None:
                    continue

                params_not_tied = list(model_not_tied.parameters())

                config_tied = copy.deepcopy(config)
                config_tied.torchscript = False
                model_tied = model_class(config_tied)
                params_tied = list(model_tied.parameters())

                # Check that the embedding layer and decoding layer are the same in size and in value
                self.assertGreater(len(params_not_tied), len(params_tied))
                # self.assertTrue(check_same_values(embeddings, decoding))

                # # Check that after modification, they remain the same.
                # embeddings.weight.data.div_(2)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
                # self.assertTrue(check_same_values(embeddings, decoding))

                # # Check that after modification, they remain the same.
                # decoding.weight.data.div_(4)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
                # self.assertTrue(check_same_values(embeddings, decoding))

                # Check that after resize they remain tied.
                model_tied.resize_token_embeddings(config.vocab_size + 10)
                params_tied_2 = list(model_tied.parameters())
                self.assertGreater(len(params_not_tied), len(params_tied))
                self.assertEqual(len(params_tied_2), len(params_tied))

                # decoding.weight.data.mul_(20)
                # # Check that the embedding layer and decoding layer are the same in size and in value
                # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
                # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))

        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            if not self.is_encoder_decoder:
                input_ids = inputs_dict["input_ids"]
                del inputs_dict["input_ids"]
            else:
                encoder_input_ids = inputs_dict["encoder_input_ids"]
                decoder_input_ids = inputs_dict["decoder_input_ids"]
                del inputs_dict["encoder_input_ids"]
                del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)
                model.to(torch_device)
                model.eval()

                wte = model.get_input_embeddings()
                if not self.is_encoder_decoder:
                    inputs_dict["inputs_embeds"] = wte(input_ids)
                else:
                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)

                with torch.no_grad():
                    outputs = model(**inputs_dict)

    class GPTModelTester(CommonModelTester):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_position_ids=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            n_positions=33,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            n_choices=3,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            scope=None,
            config_class=None,
            base_model_class=None,
            lm_head_model_class=None,
            double_head_model_class=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_position_ids = use_position_ids
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.n_choices = n_choices
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.scope = scope
            self.config_class = config_class
            self.base_model_class = base_model_class
            self.lm_head_model_class = lm_head_model_class
            self.double_head_model_class = double_head_model_class
            self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)

        def prepare_config_and_inputs(self):
            total_num_tokens = self.vocab_size
            input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)

            position_ids = None
            if self.use_position_ids:
                position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)

            token_type_ids = None
            if self.use_token_type_ids:
                total_voc = self.vocab_size
                token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)

            mc_labels = None
            lm_labels = None
            mc_token_ids = None
            if self.use_labels:
                mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)

            config = self.config_class(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                initializer_range=self.initializer_range,
            )

            return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids)

        def create_and_check_base_model(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.base_model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(input_ids, position_ids, token_type_ids)
                outputs = model(input_ids, position_ids)
                outputs = model(input_ids)
            hidden_state = outputs[0]
            self.parent.assertListEqual(
                list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]
            )

        def create_and_check_lm_head(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.lm_head_model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
            loss, lm_logits = outputs[:2]

            total_voc = self.vocab_size
            self.parent.assertListEqual(
                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
            )
            self.parent.assertListEqual(list(loss.size()), [])

        def create_and_check_presents(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            for model_class in self.all_model_classes:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
                with torch.no_grad():
                    outputs = model(input_ids)
                presents = outputs[-1]
                self.parent.assertEqual(self.num_hidden_layers, len(presents))
                self.parent.assertListEqual(
                    list(presents[0].size()),
                    [
                        2,
                        self.batch_size * self.n_choices,
                        self.num_attention_heads,
                        self.seq_length,
                        self.hidden_size // self.num_attention_heads,
                    ],
                )

        def create_and_check_double_heads(
            self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids
        ):
            model = self.double_head_model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(
                    input_ids,
                    mc_token_ids,
                    lm_labels=lm_labels,
                    mc_labels=mc_labels,
                    token_type_ids=token_type_ids,
                    position_ids=position_ids,
                )
            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
            loss = [lm_loss, mc_loss]

            total_voc = self.vocab_size
            self.parent.assertListEqual(
                list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc]
            )
            self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices])
            self.parent.assertListEqual([list(l.size()) for l in loss], [[], []])

        def create_and_check_model_from_pretrained(self):
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
                model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
                self.parent.assertIsNotNone(model)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs
            inputs_dict = {"input_ids": input_ids}
            return config, inputs_dict

        def run_common_tests(self, test_presents=False):
            config_and_inputs = self.prepare_config_and_inputs()
            self.create_and_check_base_model(*config_and_inputs)

            config_and_inputs = self.prepare_config_and_inputs()
            self.create_and_check_lm_head(*config_and_inputs)

            config_and_inputs = self.prepare_config_and_inputs()
            self.create_and_check_double_heads(*config_and_inputs)

            if test_presents:
                config_and_inputs = self.prepare_config_and_inputs()
                self.create_and_check_presents(*config_and_inputs)

        @slow
        def run_slow_tests(self):
            self.create_and_check_model_from_pretrained()


class ConfigTester(object):
    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

    def create_and_test_config_common_properties(self):
        config = self.config_class(**self.inputs_dict)
        self.parent.assertTrue(hasattr(config, "vocab_size"))
        self.parent.assertTrue(hasattr(config, "hidden_size"))
        self.parent.assertTrue(hasattr(config, "num_attention_heads"))
        self.parent.assertTrue(hasattr(config, "num_hidden_layers"))

    def create_and_test_config_to_json_string(self):
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        config_first = self.config_class(**self.inputs_dict)
        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
        config_first.to_json_file(json_file_path)
        config_second = self.config_class.from_json_file(json_file_path)
        os.remove(json_file_path)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()


global_rng = random.Random()


def ids_tensor(shape, vocab_size, rng=None, name=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()


def floats_tensor(shape, scale=1.0, rng=None, name=None):
    """Creates a random float32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.random() * scale)

    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()


@require_torch
class ModelUtilsTest(unittest.TestCase):
    @slow
    def test_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            model = BertModel.from_pretrained(model_name)
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)


if __name__ == "__main__":
    unittest.main()
```
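A short usage note on the helpers at the bottom of the deleted module: `ids_tensor` and `floats_tensor` are how the testers fabricate model inputs, and `global_rng` is what `test_headmasking` seeds for reproducibility. Assuming the module's own context (`torch`, `torch_device`, the helpers above), the pattern is:

```python
# Sketch of how the deleted helpers were used (runs inside this module's context):
rng = random.Random(42)  # seedable, mirroring global_rng.seed(42) in test_headmasking

# A (batch_size, n_choices, seq_length) batch of fake token ids for a vocab of 99,
# matching what GPTModelTester.prepare_config_and_inputs builds.
input_ids = ids_tensor([13, 3, 7], 99, rng=rng)
assert input_ids.shape == (13, 3, 7)
assert int(input_ids.max()) < 99

# Random float features in [0, 0.02), e.g. embedding-like inputs.
features = floats_tensor([13, 7, 32], scale=0.02, rng=rng)
```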
transformers/tests/modeling_tf_common_test.py  (deleted, mode 100644 → 0)
```python
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, print_function

import copy
import os
import random
import shutil
import sys
import tempfile
import unittest

from transformers import is_tf_available, is_torch_available

from .utils import require_tf

if is_tf_available():
    import tensorflow as tf
    import numpy as np

    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP

if sys.version_info[0] == 2:

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str


def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
        if "_range" in key or "_std" in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init


class TFCommonTestCases:
    @require_tf
    class TFCommonModelTester(unittest.TestCase):

        model_tester = None
        all_model_classes = ()
        test_torchscript = True
        test_pruning = True
        test_resize_embeddings = True
        is_encoder_decoder = False

        def test_initialization(self):
            pass
            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            # configs_no_init = _config_zero_init(config)
            # for model_class in self.all_model_classes:
            #     model = model_class(config=configs_no_init)
            #     for name, param in model.named_parameters():
            #         if param.requires_grad:
            #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
            #                           msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))

        def test_save_load(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                outputs = model(inputs_dict)

                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)
                    after_outputs = model(inputs_dict)

                    # Make sure we don't have nans
                    out_1 = after_outputs[0].numpy()
                    out_2 = outputs[0].numpy()
                    out_1 = out_1[~np.isnan(out_1)]
                    out_2 = out_2[~np.isnan(out_2)]
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)

        def test_pt_tf_model_equivalence(self):
            if not is_torch_available():
                return

            import torch
            import transformers

            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
                pt_model_class = getattr(transformers, pt_model_class_name)

                config.output_hidden_states = True
                tf_model = model_class(config)
                pt_model = pt_model_class(config)

                # Check we can load pt model in tf and vice-versa with model => model functions
                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

                # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
                pt_model.eval()
                pt_inputs_dict = dict(
                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
                )
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict, training=False)
                tf_hidden_states = tfo[0].numpy()
                pt_hidden_states = pto[0].numpy()
                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                self.assertLessEqual(max_diff, 2e-2)

                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
                with TemporaryDirectory() as tmpdirname:
                    pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
                    torch.save(pt_model.state_dict(), pt_checkpoint_path)
                    tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

                    tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
                    tf_model.save_weights(tf_checkpoint_path)
                    pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

                # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
                pt_model.eval()
                pt_inputs_dict = dict(
                    (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items()
                )
                with torch.no_grad():
                    pto = pt_model(**pt_inputs_dict)
                tfo = tf_model(inputs_dict)
                tfo = tfo[0].numpy()
                pto = pto[0].numpy()
                tfo[np.isnan(tfo)] = 0
                pto[np.isnan(pto)] = 0
                max_diff = np.amax(np.abs(tfo - pto))
                self.assertLessEqual(max_diff, 2e-2)

        def test_compile_tf_model(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if self.is_encoder_decoder:
                input_ids = {
                    "decoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
                    ),
                    "encoder_input_ids": tf.keras.Input(
                        batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32"
                    ),
                }
            else:
                input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

            for model_class in self.all_model_classes:
                # Prepare our model
                model = model_class(config)

                # Let's load it from the disk to be sure we can use pretrained weights
                with TemporaryDirectory() as tmpdirname:
                    outputs = model(inputs_dict)  # build the model
                    model.save_pretrained(tmpdirname)
                    model = model_class.from_pretrained(tmpdirname)

                outputs_dict = model(input_ids)
                hidden_states = outputs_dict[0]

                # Add a dense layer on top to test integration with other keras modules
                outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)

                # Compile extended model
                extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
                extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

        def test_keyword_and_dict_args(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                outputs_dict = model(inputs_dict)

                inputs_keywords = copy.deepcopy(inputs_dict)
                input_ids = inputs_keywords.pop(
                    "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None
                )
                outputs_keywords = model(input_ids, **inputs_keywords)
                output_dict = outputs_dict[0].numpy()
                output_keywords = outputs_keywords[0].numpy()

                self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)

        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            decoder_seq_length = (
                self.model_tester.decoder_seq_length
                if hasattr(self.model_tester, "decoder_seq_length")
                else self.model_tester.seq_length
            )
            encoder_seq_length = (
                self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "encoder_seq_length")
                else self.model_tester.seq_length
            )
            decoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length
            )
            encoder_key_length = (
                self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
            )

            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                outputs = model(inputs_dict)
                attentions = [t.numpy() for t in outputs[-1]]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
                out_len = len(outputs)

                if self.is_encoder_decoder:
                    self.assertEqual(out_len % 2, 0)
                    decoder_attentions = outputs[(out_len // 2) - 1]
                    self.assertEqual(model.config.output_attentions, True)
                    self.assertEqual(model.config.output_hidden_states, False)
                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                    self.assertListEqual(
                        list(decoder_attentions[0].shape[-3:]),
                        [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
                    )

                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                outputs = model(inputs_dict)
                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)

                attentions = [t.numpy() for t in outputs[-1]]
                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                config.output_hidden_states = True
                config.output_attentions = False
                model = model_class(config)
                outputs = model(inputs_dict)
                hidden_states = [t.numpy() for t in outputs[-1]]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size]
                )

        def test_model_common_attributes(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
                x = model.get_output_embeddings()
                assert x is None or isinstance(x, tf.keras.layers.Layer)

        def test_determinism(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            for model_class in self.all_model_classes:
                model = model_class(config)
                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
                out_1 = first.numpy()
                out_2 = second.numpy()
                out_1 = out_1[~np.isnan(out_1)]
                out_2 = out_2[~np.isnan(out_2)]
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

        def _get_embeds(self, wte, input_ids):
            # ^^ In our TF models, the input_embeddings can take slightly different forms,
            # so we try a few of them.
            # We used to fall back to just synthetically creating a dummy tensor of ones:
            try:
                x = wte(input_ids, mode="embedding")
            except Exception:
                try:
                    x = wte([input_ids], mode="embedding")
                except Exception:
                    try:
                        x = wte([input_ids, None, None, None], mode="embedding")
                    except Exception:
                        if hasattr(self.model_tester, "embedding_size"):
                            x = tf.ones(
                                input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32
                            )
                        else:
                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
            return x

        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            if not self.is_encoder_decoder:
                input_ids = inputs_dict["input_ids"]
                del inputs_dict["input_ids"]
            else:
                encoder_input_ids = inputs_dict["encoder_input_ids"]
                decoder_input_ids = inputs_dict["decoder_input_ids"]
                del inputs_dict["encoder_input_ids"]
                del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)

                wte = model.get_input_embeddings()
                if not self.is_encoder_decoder:
                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
                else:
                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)

                outputs = model(inputs_dict)


def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    """Creates a random int32 tensor of the shape within the vocab size."""
    if rng is None:
        rng = random.Random()

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

    output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32)

    return output


if __name__ == "__main__":
    unittest.main()
```
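The most TF-specific check above is `test_compile_tf_model`, which verifies that a model composes with ordinary Keras graph building. A standalone sketch of that pattern, with a plain `Embedding` standing in for the transformer body (the stand-in is an assumption for illustration, not part of the deleted file):

```python
import tensorflow as tf

# Symbolic input, as in test_compile_tf_model.
input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32")

# Stand-in for the model under test: any layer mapping ids -> hidden states.
hidden_states = tf.keras.layers.Embedding(100, 32)(input_ids)

# Dense head on top, then compile the extended model end to end,
# mirroring the optimizer/loss/metric used in the deleted test.
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
extended_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")],
)
```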
transformers/tests/tokenization_tests_commons.py  (deleted, mode 100644 → 0)
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
shutil
import
sys
import
tempfile
import
unittest
from
io
import
open
if
sys
.
version_info
[
0
]
==
2
:
import
cPickle
as
pickle
class
TemporaryDirectory
(
object
):
"""Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
def
__enter__
(
self
):
self
.
name
=
tempfile
.
mkdtemp
()
return
self
.
name
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
shutil
.
rmtree
(
self
.
name
)
else
:
import
pickle
TemporaryDirectory
=
tempfile
.
TemporaryDirectory
unicode
=
str
class
CommonTestCases
:
class
CommonTokenizerTester
(
unittest
.
TestCase
):
tokenizer_class
=
None
def
setUp
(
self
):
self
.
tmpdirname
=
tempfile
.
mkdtemp
()
def
tearDown
(
self
):
shutil
.
rmtree
(
self
.
tmpdirname
)
def
get_tokenizer
(
self
,
**
kwargs
):
raise
NotImplementedError
def
get_input_output_texts
(
self
):
raise
NotImplementedError
def
test_tokenizers_common_properties
(
self
):
tokenizer
=
self
.
get_tokenizer
()
attributes_list
=
[
"bos_token"
,
"eos_token"
,
"unk_token"
,
"sep_token"
,
"pad_token"
,
"cls_token"
,
"mask_token"
,
]
for
attr
in
attributes_list
:
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
+
"_id"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens_ids"
))
attributes_list
=
[
"max_len"
,
"init_inputs"
,
"init_kwargs"
,
"added_tokens_encoder"
,
"added_tokens_decoder"
]
for
attr
in
attributes_list
:
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
def
test_save_and_load_tokenizer
(
self
):
# safety check on max_len default value so we are sure the test works
tokenizer
=
self
.
get_tokenizer
()
self
.
assertNotEqual
(
tokenizer
.
max_len
,
42
)
# Now let's start the test
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
before_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
with
TemporaryDirectory
()
as
tmpdirname
:
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
after_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
,
max_len
=
43
)
self
.
assertEqual
(
tokenizer
.
max_len
,
43
)
def
test_pickle_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
self
.
assertIsNotNone
(
tokenizer
)
text
=
"Munich and Berlin are nice cities"
subwords
=
tokenizer
.
tokenize
(
text
)
with
TemporaryDirectory
()
as
tmpdirname
:
filename
=
os
.
path
.
join
(
tmpdirname
,
"tokenizer.bin"
)
with
open
(
filename
,
"wb"
)
as
handle
:
pickle
.
dump
(
tokenizer
,
handle
)
with
open
(
filename
,
"rb"
)
as
handle
:
tokenizer_new
=
pickle
.
load
(
handle
)
subwords_loaded
=
tokenizer_new
.
tokenize
(
text
)
self
.
assertListEqual
(
subwords
,
subwords_loaded
)
def
test_added_tokens_do_lower_case
(
self
):
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
True
)
special_token
=
tokenizer
.
all_special_tokens
[
0
]
text
=
special_token
+
" aaaaa bbbbbb low cccccccccdddddddd l "
+
special_token
text2
=
special_token
+
" AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l "
+
special_token
toks0
=
tokenizer
.
tokenize
(
text
)
# toks before adding new_toks
new_toks
=
[
"aaaaa bbbbbb"
,
"cccccccccdddddddd"
,
"AAAAA BBBBBB"
,
"CCCCCCCCCDDDDDDDD"
]
added
=
tokenizer
.
add_tokens
(
new_toks
)
self
.
assertEqual
(
added
,
2
)
toks
=
tokenizer
.
tokenize
(
text
)
toks2
=
tokenizer
.
tokenize
(
text2
)
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
# toks0 should be longer
self
.
assertListEqual
(
toks
,
toks2
)
# Check that none of the special tokens are lowercased
sequence_with_special_tokens
=
"A "
+
" yEs "
.
join
(
tokenizer
.
all_special_tokens
)
+
" B"
tokenized_sequence
=
tokenizer
.
tokenize
(
sequence_with_special_tokens
)
for
special_token
in
tokenizer
.
all_special_tokens
:
self
.
assertTrue
(
special_token
in
tokenized_sequence
)
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
False
)
added
=
tokenizer
.
add_tokens
(
new_toks
)
self
.
assertEqual
(
added
,
4
)
toks
=
tokenizer
.
tokenize
(
text
)
toks2
=
tokenizer
.
tokenize
(
text2
)
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
# Length should still be the same
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
self
.
assertNotEqual
(
toks
[
1
],
toks2
[
1
])
# But at least the first non-special tokens should differ
    def test_add_tokens_tokenizer(self):
        tokenizer = self.get_tokenizer()

        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)

        self.assertNotEqual(vocab_size, 0)
        self.assertEqual(vocab_size, all_size)

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)

        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))

        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

        new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)

        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

        tokens = tokenizer.encode(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0], tokenizer.eos_token_id)
        self.assertEqual(tokens[-2], tokenizer.pad_token_id)
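The distinction exercised here is that vocab_size counts only the base vocabulary, while len(tokenizer) also counts tokens added afterwards, which receive ids at the end of the id space (hence the assertions that added-token ids exceed vocab_size - 1). A quick sketch, under the same assumption of an available bert-base-uncased checkpoint and with made-up token strings:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.vocab_size, len(tokenizer))   # equal before anything is added

tokenizer.add_tokens(["new_tok_1", "new_tok_2"])
tokenizer.add_special_tokens({"eos_token": "<my_eos>"})

# vocab_size is unchanged, len() grew by 3, and the new ids sit past the base vocab
print(tokenizer.vocab_size, len(tokenizer))
print(tokenizer.encode("new_tok_1 <my_eos>", add_special_tokens=False))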
    def test_add_special_tokens(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        special_token = "[SPECIAL TOKEN]"

        tokenizer.add_special_tokens({"cls_token": special_token})
        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
        assert len(encoded_special_token) == 1

        text = " ".join([input_text, special_token, output_text])
        encoded = tokenizer.encode(text, add_special_tokens=False)

        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
        output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
        assert encoded == input_encoded + special_token_id + output_encoded

        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
        assert special_token not in decoded
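Two properties are checked above: a registered special token always encodes to a single id (it is never split by the subword model), and decode(..., skip_special_tokens=True) strips it from the output. Illustration, with the tokenizer choice again hypothetical:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({"cls_token": "[SPECIAL TOKEN]"})

ids = tokenizer.encode("first part [SPECIAL TOKEN] second part", add_special_tokens=False)
print(tokenizer.decode(ids))                            # keeps the marker
print(tokenizer.decode(ids, skip_special_tokens=True))  # drops it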
    def test_required_methods_tokenizer(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        tokens = tokenizer.tokenize(input_text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
        self.assertListEqual(ids, ids_2)

        tokens_2 = tokenizer.convert_ids_to_tokens(ids)
        text_2 = tokenizer.decode(ids)

        self.assertEqual(text_2, output_text)

        self.assertNotEqual(len(tokens_2), 0)
        self.assertIsInstance(text_2, str)
    def test_encode_decode_with_spaces(self):
        tokenizer = self.get_tokenizer()

        new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
        tokenizer.add_tokens(new_toks)
        input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
        encoded = tokenizer.encode(input, add_special_tokens=False)
        decoded = tokenizer.decode(encoded)

        self.assertEqual(decoded, input)
    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))

        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)
    def test_mask_output(self):
        if sys.version_info <= (3, 0):
            # __qualname__ below is Python 3 only
            return

        tokenizer = self.get_tokenizer()

        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
            seq_0 = "Test this method."
            seq_1 = "With these inputs."
            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
            sequences, mask = information["input_ids"], information["token_type_ids"]
            self.assertEqual(len(sequences), len(mask))
    def test_number_of_added_tokens(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "Test this method."
        seq_1 = "With these inputs."

        sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
        attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

        # Method is implemented (e.g. not GPT-2)
        if len(attached_sequences) != 2:
            self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
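num_added_tokens(pair=...) reports how many special tokens encode() wraps around one or two sequences, which is exactly the length difference asserted above. For a BERT-style tokenizer this should print 2 for a single sequence ([CLS] A [SEP]) and 3 for a pair ([CLS] A [SEP] B [SEP]); the checkpoint choice is illustrative:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.num_added_tokens())           # 2: [CLS] ... [SEP]
print(tokenizer.num_added_tokens(pair=True))  # 3: [CLS] ... [SEP] ... [SEP]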
    def test_maximum_encoding_length_single_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        stride = 2

        sequence = tokenizer.encode(seq_0, add_special_tokens=False)
        num_added_tokens = tokenizer.num_added_tokens()
        total_length = len(sequence) + num_added_tokens
        information = tokenizer.encode_plus(
            seq_0,
            max_length=total_length - 2,
            add_special_tokens=True,
            stride=stride,
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
        self.assertEqual(len(truncated_sequence), total_length - 2)
        self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
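In words: when max_length forces 2 tokens out, the overflow bucket contains those 2 tokens plus a stride-sized overlap with the kept window, so downstream code can re-encode the tail with context. A standalone sketch of the same call, with bert-base-uncased as an assumed checkpoint and max_length=8 chosen only to force truncation:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
info = tokenizer.encode_plus(
    "This is a sentence to be encoded.",
    max_length=8,                      # small enough to force truncation
    stride=2,                          # re-emit the last 2 kept tokens with the overflow
    return_overflowing_tokens=True,
)
print(info["input_ids"])               # truncated window, wrapped in special tokens
print(info["overflowing_tokens"])      # cut tokens, prefixed by the 2-token stride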
    def test_maximum_encoding_length_pair_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        seq_1 = "This is another sentence to be encoded."
        stride = 2

        sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
        sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
        truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
            tokenizer.encode(seq_0, add_special_tokens=False),
            tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
        )

        information = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_second",
            return_overflowing_tokens=True,
        )
        information_first_truncated = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_first",
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]
        overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
        self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
        self.assertEqual(len(truncated_sequence), len(sequence) - 2)
        self.assertEqual(truncated_sequence, truncated_second_sequence)
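For pairs, truncation_strategy picks which side loses tokens: "only_second" trims seq_1, "only_first" trims seq_0, and the overflow comes from the trimmed side accordingly. A minimal sketch under the usual bert-base-uncased assumption:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
seq_0 = "This is a sentence to be encoded."
seq_1 = "This is another sentence to be encoded."
full = tokenizer.encode(seq_0, seq_1)

second = tokenizer.encode_plus(
    seq_0, seq_1, max_length=len(full) - 2, stride=2,
    truncation_strategy="only_second", return_overflowing_tokens=True,
)
first = tokenizer.encode_plus(
    seq_0, seq_1, max_length=len(full) - 2, stride=2,
    truncation_strategy="only_first", return_overflowing_tokens=True,
)
print(second["overflowing_tokens"])   # taken from the end of seq_1
print(first["overflowing_tokens"])    # taken from the end of seq_0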
    def test_encode_input_type(self):
        tokenizer = self.get_tokenizer()

        sequence = "Let's encode this sequence"

        tokens = tokenizer.tokenize(sequence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

        self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
        self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
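The point being tested: encode() accepts a raw string, a list of token strings, or a list of ids, and all three routes produce the same output. The same check as a standalone sketch (checkpoint choice illustrative):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's encode this sequence"

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

a = tokenizer.encode(sequence)   # raw string
b = tokenizer.encode(tokens)     # list of token strings
c = tokenizer.encode(ids)        # list of ids
assert a == b == c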
    def test_special_tokens_mask(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
            sequence_1, add_special_tokens=False
        )
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing with already existing special tokens
        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.sep_token_id == tokenizer.unk_token_id:
            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
        special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
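The special tokens mask is a parallel list of 0/1 flags over input_ids, where 1 marks positions filled by special tokens, so masking them out must recover the bare encoding. A compact version of the same filtering, under the usual checkpoint assumption:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer.encode_plus("Encode this.", return_special_tokens_mask=True)
ids = enc["input_ids"]
mask = enc["special_tokens_mask"]     # 1 marks [CLS]/[SEP], 0 marks real content

content_only = [tok for tok, m in zip(ids, mask) if not m]
assert content_only == tokenizer.encode("Encode this.", add_special_tokens=False)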
    def test_padding_to_max_length(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "right"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "left"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)

        tokenizer.padding_side = "right"
        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_right_length = len(padded_sequence_right)

        tokenizer.padding_side = "left"
        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_left_length = len(padded_sequence_left)

        assert sequence_length == padded_sequence_right_length
        assert encoded_sequence == padded_sequence_right
        assert sequence_length == padded_sequence_left_length
        assert encoded_sequence == padded_sequence_left
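Outside the test harness, the padding_side attribute controls whether pad ids are appended or prepended when pad_to_max_length is set together with max_length. A short sketch, again assuming bert-base-uncased and an arbitrary padding amount of 4:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Sequence")

tokenizer.padding_side = "right"
right = tokenizer.encode("Sequence", max_length=len(ids) + 4, pad_to_max_length=True)

tokenizer.padding_side = "left"
left = tokenizer.encode("Sequence", max_length=len(ids) + 4, pad_to_max_length=True)

assert right == ids + [tokenizer.pad_token_id] * 4
assert left == [tokenizer.pad_token_id] * 4 + ids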
    def test_encode_plus_with_padding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id

        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
        input_ids = encoded_sequence["input_ids"]
        token_type_ids = encoded_sequence["token_type_ids"]
        attention_mask = encoded_sequence["attention_mask"]
        special_tokens_mask = encoded_sequence["special_tokens_mask"]
        sequence_length = len(input_ids)

        # Test right padding
        tokenizer.padding_side = "right"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert input_ids + [padding_idx] * padding_size == padded_input_ids
        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
        assert attention_mask + [0] * padding_size == padded_attention_mask
        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask

        # Test left padding
        tokenizer.padding_side = "left"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + input_ids == padded_input_ids
        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
        assert [0] * padding_size + attention_mask == padded_attention_mask
        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask