chenpangpang / transformers · Commit 40ed7172

Authored Dec 13, 2019 by erenup
Merge remote-tracking branch 'refs/remotes/huggingface/master'
Parents: 86a63070, 7296f101

Changes: 183 · Showing 20 changed files with 605 additions and 260 deletions (+605 / -260)
Files shown:

transformers/tests/modeling_tf_bert_test.py            +3    -8
transformers/tests/modeling_tf_common_test.py          +40   -170
transformers/tests/modeling_tf_ctrl_test.py            +3    -4
transformers/tests/modeling_tf_distilbert_test.py      +3    -4
transformers/tests/modeling_tf_gpt2_test.py            +3    -4
transformers/tests/modeling_tf_openai_gpt_test.py      +3    -4
transformers/tests/modeling_tf_roberta_test.py         +24   -10
transformers/tests/modeling_tf_transfo_xl_test.py      +3    -4
transformers/tests/modeling_tf_xlm_test.py             +3    -4
transformers/tests/modeling_tf_xlnet_test.py           +31   -5
transformers/tests/modeling_transfo_xl_test.py         +6    -4
transformers/tests/modeling_xlm_test.py                +8    -4
transformers/tests/modeling_xlnet_test.py              +80   -15
transformers/tests/optimization_test.py                +20   -15
transformers/tests/optimization_tf_test.py             +90   -0
transformers/tests/tokenization_albert_test.py         +78   -0
transformers/tests/tokenization_auto_test.py           +8    -1
transformers/tests/tokenization_bert_japanese_test.py  +191  -0
transformers/tests/tokenization_bert_test.py           +4    -2
transformers/tests/tokenization_distilbert_test.py     +4    -2
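Most of the test files below switch from `@pytest.mark.slow` markers and module-level `pytestmark` skips to `require_tf` / `require_torch` / `slow` decorators imported from `.utils`. For orientation, here is a minimal sketch of what such decorators could look like; it is an assumption for illustration only, not the repository's actual `.utils` module, and the RUN_SLOW flag name is likewise assumed.

import os
import unittest

from transformers import is_tf_available, is_torch_available


def require_tf(test_case):
    # Skip the decorated test (or whole test class) unless TensorFlow is installed.
    return unittest.skipUnless(is_tf_available(), "test requires TensorFlow")(test_case)


def require_torch(test_case):
    # Skip the decorated test (or whole test class) unless PyTorch is installed.
    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)


def slow(test_case):
    # Only run the test when RUN_SLOW=1 is exported (assumed flag name).
    return unittest.skipUnless(os.environ.get("RUN_SLOW", "0") == "1", "test is slow")(test_case)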
transformers/tests/modeling_tf_bert_test.py

@@ -18,11 +18,11 @@ from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import BertConfig, is_tf_available

@@ -36,10 +36,9 @@ if is_tf_available():
                                              TFBertForTokenClassification,
                                              TFBertForQuestionAnswering,
                                              TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,

@@ -131,10 +130,6 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):

        def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask,
                                        sequence_labels, token_labels, choice_labels):
            model = TFBertModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # sequence_output, pooled_output = model(**inputs)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}

@@ -313,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
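For context, a hedged sketch of the dict-style call that create_and_check_bert_model exercises above; the config numbers and ids are placeholders chosen only so the snippet is self-contained, not the tester's settings.

import tensorflow as tf
from transformers import BertConfig, TFBertModel

config = BertConfig(99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37)  # first positional arg is the vocab size
model = TFBertModel(config=config)
inputs = {'input_ids': tf.constant([[1, 2, 3, 4, 5]]),
          'attention_mask': tf.constant([[1, 1, 1, 1, 1]]),
          'token_type_ids': tf.constant([[0, 0, 0, 0, 0]])}
outputs = model(inputs)                      # a single dict is accepted as the first argument
sequence_output, pooled_output = outputs[:2]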
transformers/tests/modeling_tf_common_test.py

@@ -25,18 +25,17 @@ import unittest

import uuid
import tempfile
import pytest
import sys

from transformers import is_tf_available, is_torch_available
from .utils import require_tf, slow

if is_tf_available():
    import tensorflow as tf
    import numpy as np
    from transformers import TFPreTrainedModel
    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")

if sys.version_info[0] == 2:
    import cPickle as pickle

@@ -62,6 +61,7 @@ def _config_zero_init(config):

class TFCommonTestCases:

+   @require_tf
    class TFCommonModelTester(unittest.TestCase):

        model_tester = None

@@ -233,80 +233,6 @@ class TFCommonTestCases:
                 self.model_tester.seq_length,
                 self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])

        def test_headmasking(self):
            pass
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# config.output_attentions = True
# config.output_hidden_states = True
# configs_no_init = _config_zero_init(config) # To be sure we have no Nan
# for model_class in self.all_model_classes:
# model = model_class(config=configs_no_init)
# model.eval()
# # Prepare head_mask
# # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
# head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
# head_mask[0, 0] = 0
# head_mask[-1, :-1] = 0
# head_mask.requires_grad_(requires_grad=True)
# inputs = inputs_dict.copy()
# inputs['head_mask'] = head_mask
# outputs = model(**inputs)
# # Test that we can get a gradient back for importance score computation
# output = sum(t.sum() for t in outputs[0])
# output = output.sum()
# output.backward()
# multihead_outputs = head_mask.grad
# attentions = outputs[-1]
# hidden_states = outputs[-2]
# # Remove Nan
# self.assertIsNotNone(multihead_outputs)
# self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
# self.assertAlmostEqual(
# attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
# self.assertNotEqual(
# attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
# self.assertNotEqual(
# attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
# self.assertAlmostEqual(
# attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
# self.assertNotEqual(
# attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
        def test_head_pruning(self):
            pass
# if not self.test_pruning:
# return
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# for model_class in self.all_model_classes:
# config.output_attentions = True
# config.output_hidden_states = False
# model = model_class(config=config)
# model.eval()
# heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
# -1: [0]}
# model.prune_heads(heads_to_prune)
# outputs = model(**inputs_dict)
# attentions = outputs[-1]
# self.assertEqual(
# attentions[0].shape[-3], 1)
# self.assertEqual(
# attentions[1].shape[-3], self.model_tester.num_attention_heads)
# self.assertEqual(
# attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -323,75 +249,14 @@ class TFCommonTestCases:
                list(hidden_states[0].shape[-2:]),
                [self.model_tester.seq_length, self.model_tester.hidden_size])

        def test_model_common_attributes(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def test_resize_tokens_embeddings(self):
            pass
# original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# if not self.test_resize_embeddings:
# return
# for model_class in self.all_model_classes:
# config = copy.deepcopy(original_config)
# model = model_class(config)
# model_vocab_size = config.vocab_size
# # Retrieve the embeddings and clone theme
# model_embed = model.resize_token_embeddings(model_vocab_size)
# cloned_embeddings = model_embed.weight.clone()
# # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
# model_embed = model.resize_token_embeddings(model_vocab_size + 10)
# self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
# # Check that it actually resizes the embeddings matrix
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
# # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
# model_embed = model.resize_token_embeddings(model_vocab_size - 15)
# self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
# # Check that it actually resizes the embeddings matrix
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
# # Check that adding and removing tokens has not modified the first part of the embedding matrix.
# models_equal = True
# for p1, p2 in zip(cloned_embeddings, model_embed.weight):
# if p1.data.ne(p2.data).sum() > 0:
# models_equal = False
# self.assertTrue(models_equal)
        def test_tie_model_weights(self):
            pass
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# def check_same_values(layer_1, layer_2):
# equal = True
# for p1, p2 in zip(layer_1.weight, layer_2.weight):
# if p1.data.ne(p2.data).sum() > 0:
# equal = False
# return equal
# for model_class in self.all_model_classes:
# if not hasattr(model_class, 'tie_weights'):
# continue
# config.torchscript = True
# model_not_tied = model_class(config)
# params_not_tied = list(model_not_tied.parameters())
# config_tied = copy.deepcopy(config)
# config_tied.torchscript = False
# model_tied = model_class(config_tied)
# params_tied = list(model_tied.parameters())
# # Check that the embedding layer and decoding layer are the same in size and in value
# self.assertGreater(len(params_not_tied), len(params_tied))
# # Check that after resize they remain tied.
# model_tied.resize_token_embeddings(config.vocab_size + 10)
# params_tied_2 = list(model_tied.parameters())
# self.assertGreater(len(params_not_tied), len(params_tied))
# self.assertEqual(len(params_tied_2), len(params_tied))
            for model_class in self.all_model_classes:
                model = model_class(config)
                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
                x = model.get_output_embeddings()
                assert x is None or isinstance(x, tf.keras.layers.Layer)

        def test_determinism(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -401,6 +266,35 @@ class TFCommonTestCases:
                first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
                self.assertTrue(tf.math.equal(first, second).numpy().all())

        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            input_ids = inputs_dict["input_ids"]
            del inputs_dict["input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)

                wte = model.get_input_embeddings()
                try:
                    x = wte(input_ids, mode="embedding")
                except:
                    try:
                        x = wte([input_ids], mode="embedding")
                    except:
                        try:
                            x = wte([input_ids, None, None, None], mode="embedding")
                        except:
                            if hasattr(self.model_tester, "embedding_size"):
                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
                            else:
                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
                # ^^ In our TF models, the input_embeddings can take slightly different forms,
                # so we try a few of them.
                # We used to fall back to just synthetically creating a dummy tensor of ones:
                #
                inputs_dict["inputs_embeds"] = x
                outputs = model(inputs_dict)


def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    """Creates a random int32 tensor of the shape within the vocab size."""

@@ -422,29 +316,5 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
    return output


class TFModelUtilsTest(unittest.TestCase):
    @pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
    def test_model_from_pretrained(self):
        pass
# logging.basicConfig(level=logging.INFO)
# for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
# config = BertConfig.from_pretrained(model_name)
# self.assertIsNotNone(config)
# self.assertIsInstance(config, PretrainedConfig)
# model = BertModel.from_pretrained(model_name)
# model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
# self.assertIsNotNone(model)
# self.assertIsInstance(model, PreTrainedModel)
# for value in loading_info.values():
# self.assertEqual(len(value), 0)
# config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
# model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
# self.assertEqual(model.config.output_attentions, True)
# self.assertEqual(model.config.output_hidden_states, True)
# self.assertEqual(model.config, config)
if __name__ == "__main__":
    unittest.main()
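Only the docstring of the ids_tensor helper is visible above. As a rough, assumed sketch of the behaviour that docstring describes (a random int32 tensor filled with ids below vocab_size); this is not the test suite's own implementation:

import random
import tensorflow as tf

def ids_tensor_sketch(shape, vocab_size, rng=None):
    # Assumed re-implementation for illustration; not the repository's helper.
    rng = rng or random.Random()
    total = 1
    for dim in shape:
        total *= dim
    values = [rng.randint(0, vocab_size - 1) for _ in range(total)]
    return tf.constant(values, shape=shape, dtype=tf.int32)

# e.g. ids_tensor_sketch([2, 7], vocab_size=99) -> int32 tensor of shape (2, 7) with token ids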
transformers/tests/modeling_tf_ctrl_test.py

@@ -18,11 +18,11 @@ from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import CTRLConfig, is_tf_available

@@ -30,10 +30,9 @@ if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
                                               TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()

@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_tf_distilbert_test.py

@@ -17,10 +17,10 @@ from __future__ import division
from __future__ import print_function

import unittest
import pytest

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import DistilBertConfig, is_tf_available

@@ -30,10 +30,9 @@ if is_tf_available():
                                                     TFDistilBertForMaskedLM,
                                                     TFDistilBertForQuestionAnswering,
                                                     TFDistilBertForSequenceClassification)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,

@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)

-   # @pytest.mark.slow
+   # @slow
    # def test_model_from_pretrained(self):
    #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_tf_gpt2_test.py

@@ -18,11 +18,11 @@ from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import GPT2Config, is_tf_available

@@ -31,10 +31,9 @@ if is_tf_available():
    from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                               TFGPT2DoubleHeadsModel,
                                               TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,

@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_tf_openai_gpt_test.py

@@ -18,11 +18,11 @@ from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import OpenAIGPTConfig, is_tf_available

@@ -31,10 +31,9 @@ if is_tf_available():
    from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                 TFOpenAIGPTDoubleHeadsModel,
                                                 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,

@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_tf_roberta_test.py

@@ -18,10 +18,10 @@ from __future__ import print_function

import unittest
import shutil
import pytest

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow

from transformers import RobertaConfig, is_tf_available

@@ -30,11 +30,11 @@ if is_tf_available():
    import numpy
    from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
                                                  TFRobertaForSequenceClassification,
                                                  TFRobertaForTokenClassification,
                                                  TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


@require_tf
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFRobertaModel, TFRobertaForMaskedLM,

@@ -154,6 +154,20 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids,
                                                              input_mask, sequence_labels, token_labels,
                                                              choice_labels):
            config.num_labels = self.num_labels
            model = TFRobertaForTokenClassification(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape),
                [self.batch_size, self.seq_length, self.num_labels])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,

@@ -176,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:

@@ -188,7 +202,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
class TFRobertaModelIntegrationTest(unittest.TestCase):

-   @pytest.mark.slow
+   @slow
    def test_inference_masked_lm(self):
        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')

@@ -209,7 +223,7 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
            numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))

-   @pytest.mark.slow
+   @slow
    def test_inference_no_head(self):
        model = TFRobertaModel.from_pretrained('roberta-base')

@@ -225,7 +239,7 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
            numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3))

-   @pytest.mark.slow
+   @slow
    def test_inference_classification_head(self):
        model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
View file @
40ed7172
...
...
@@ -19,10 +19,10 @@ from __future__ import print_function
import
unittest
import
random
import
shutil
import
pytest
from
.modeling_tf_common_test
import
(
TFCommonTestCases
,
ids_tensor
)
from
.configuration_common_test
import
ConfigTester
from
.utils
import
require_tf
,
slow
from
transformers
import
TransfoXLConfig
,
is_tf_available
...
...
@@ -31,10 +31,9 @@ if is_tf_available():
from
transformers.modeling_tf_transfo_xl
import
(
TFTransfoXLModel
,
TFTransfoXLLMHeadModel
,
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
)
else
:
pytestmark
=
pytest
.
mark
.
skip
(
"Require TensorFlow"
)
@
require_tf
class
TFTransfoXLModelTest
(
TFCommonTestCases
.
TFCommonModelTester
):
all_model_classes
=
(
TFTransfoXLModel
,
TFTransfoXLLMHeadModel
)
if
is_tf_available
()
else
()
...
...
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_transfo_xl_lm_head
(
*
config_and_inputs
)
@
pytest
.
mark
.
slow
@
slow
def
test_model_from_pretrained
(
self
):
cache_dir
=
"/tmp/transformers_test/"
for
model_name
in
list
(
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
.
keys
())[:
1
]:
...
...
transformers/tests/modeling_tf_xlm_test.py

@@ -18,7 +18,6 @@ from __future__ import print_function

import unittest
import shutil
import pytest

from transformers import is_tf_available

@@ -29,13 +28,13 @@ if is_tf_available():
                                              TFXLMForSequenceClassification,
                                              TFXLMForQuestionAnsweringSimple,
                                              TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow


@require_tf
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,

@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_tf_xlnet_test.py

@@ -21,7 +21,6 @@ import unittest

import json
import random
import shutil
import pytest

from transformers import XLNetConfig, is_tf_available

@@ -30,18 +29,21 @@ if is_tf_available():
    from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
                                                TFXLNetForSequenceClassification,
                                                TFXLNetForTokenClassification,
                                                TFXLNetForQuestionAnsweringSimple,
                                                TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow


@require_tf
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFXLNetModel, TFXLNetLMHeadModel,
                         TFXLNetForSequenceClassification,
                         TFXLNetForTokenClassification,
                         TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
    test_pruning = False

@@ -258,6 +260,26 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2,
                                                            input_ids_q, perm_mask, input_mask,
                                                            target_mapping, segment_ids, lm_labels,
                                                            sequence_labels, is_impossible_labels):
            config.num_labels = input_ids_1.shape[1]
            model = TFXLNetForTokenClassification(config)
            inputs = {'input_ids': input_ids_1,
                      'attention_mask': input_mask,
                      # 'token_type_ids': token_type_ids
                      }
            logits, mems_1 = model(inputs)
            result = {
                "mems_1": [mem.numpy() for mem in mems_1],
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape),
                [self.batch_size, self.seq_length, config.num_labels])
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,

@@ -289,12 +311,16 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)

    def test_xlnet_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)

    def test_xlnet_qa(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_transfo_xl_test.py

@@ -19,7 +19,6 @@ from __future__ import print_function

import unittest
import random
import shutil
import pytest

from transformers import is_torch_available

@@ -27,12 +26,13 @@ if is_torch_available():
    import torch
    from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
    from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device


@require_torch
class TransfoXLModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()

@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TransfoXLModel(config)
+           model.to(torch_device)
            model.eval()

            hidden_states_1, mems_1 = model(input_ids_1)

@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TransfoXLLMHeadModel(config)
+           model.to(torch_device)
            model.eval()

            lm_logits_1, mems_1 = model(input_ids_1)

@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
        output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
        self.model_tester.check_transfo_xl_lm_head_output(output_result)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
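The PyTorch tests above now move every model to a shared torch_device imported from .utils before calling it. A plausible one-line definition of that constant (an assumption for illustration, not necessarily the repository's own code) is:

import torch

# Run the test models on GPU when one is available, otherwise on CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"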
transformers/tests/modeling_xlm_test.py

@@ -18,7 +18,6 @@ from __future__ import print_function

import unittest
import shutil
import pytest

from transformers import is_torch_available

@@ -26,13 +25,13 @@ if is_torch_available():
    from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                              XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
    from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device


@require_torch
class XLMModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,

@@ -148,6 +147,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths,
                                       sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMModel(config=config)
+           model.to(torch_device)
            model.eval()
            outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
            outputs = model(input_ids, langs=token_type_ids)

@@ -163,6 +163,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths,
                                         sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMWithLMHeadModel(config)
+           model.to(torch_device)
            model.eval()

            loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)

@@ -182,6 +183,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths,
                                           sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMForQuestionAnsweringSimple(config)
+           model.to(torch_device)
            model.eval()

            outputs = model(input_ids)

@@ -206,6 +208,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths,
                                    sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMForQuestionAnswering(config)
+           model.to(torch_device)
            model.eval()

            outputs = model(input_ids)

@@ -260,6 +263,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths,
                                                  sequence_labels, token_labels, is_impossible_labels, input_mask):
            model = XLMForSequenceClassification(config)
+           model.to(torch_device)
            model.eval()

            (logits,) = model(input_ids)

@@ -312,7 +316,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
transformers/tests/modeling_xlnet_test.py

@@ -21,24 +21,25 @@ import unittest

import json
import random
import shutil
import pytest

from transformers import is_torch_available

if is_torch_available():
    import torch
-   from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel,
-                             XLNetForSequenceClassification, XLNetForQuestionAnswering)
+   from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel,
+                             XLNetForSequenceClassification, XLNetForTokenClassification,
+                             XLNetForQuestionAnswering)
    from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
else:
    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device


@require_torch
class XLNetModelTest(CommonTestCases.CommonModelTester):

-   all_model_classes = (XLNetModel, XLNetLMHeadModel,
+   all_model_classes = (XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
                         XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else ()
    test_pruning = False

@@ -99,18 +100,20 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()

            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-           perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
+           perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1,
+                                   dtype=torch.float, device=torch_device)
            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-           target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
+           target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1,
+                                        dtype=torch.float, device=torch_device)
            target_mapping[:, 0, -1] = 1.0  # predict last token

            sequence_labels = None
            lm_labels = None
            is_impossible_labels = None
            token_labels = None
            if self.use_labels:
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            config = XLNetConfig(
                vocab_size_or_config_json_file=self.vocab_size,

@@ -129,15 +132,16 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                num_labels=self.type_sequence_label_size)

            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                   target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
+                   target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels)

        def set_seed(self):
            random.seed(self.seed)
            torch.manual_seed(self.seed)

        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                                             target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                                             target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetModel(config)
            model.to(torch_device)
            model.eval()

            _, _ = model(input_ids_1, input_mask=input_mask)

@@ -152,6 +156,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
            config.mem_len = 0
            model = XLNetModel(config)
+           model.to(torch_device)
            model.eval()
            no_mems_outputs = model(input_ids_1)
            self.parent.assertEqual(len(no_mems_outputs), 1)

@@ -163,9 +168,23 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q,
                                                              perm_mask, input_mask, target_mapping, segment_ids,
                                                              lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetModel(config)
            model.to(torch_device)
            model.eval()

            _, _, attentions = model(input_ids_1, target_mapping=target_mapping)

            self.parent.assertEqual(len(attentions), config.n_layer)
            self.parent.assertIsInstance(attentions[0], tuple)
            self.parent.assertEqual(len(attentions[0]), 2)
            self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)

        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                                          target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                                          target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetLMHeadModel(config)
            model.to(torch_device)
            model.eval()

            loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)

@@ -204,8 +223,9 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                                     target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                                     target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetForQuestionAnswering(config)
            model.to(torch_device)
            model.eval()

            outputs = model(input_ids_1)

@@ -261,9 +281,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                list(list(mem.size()) for mem in result["mems"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                                                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetForTokenClassification(config)
            model.to(torch_device)
            model.eval()

            logits, mems_1 = model(input_ids_1)
            loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
            result = {
                "loss": loss,
                "mems_1": mems_1,
                "logits": logits,
            }

            self.parent.assertListEqual(
                list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.seq_length, self.type_sequence_label_size])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
             target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
            return config, inputs_dict

        def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                                                   target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                                                   target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
            model = XLNetForSequenceClassification(config)
            model.to(torch_device)
            model.eval()

            logits, mems_1 = model(input_ids_1)

@@ -289,7 +343,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels,
-            sequence_labels, is_impossible_labels) = config_and_inputs
+            sequence_labels, is_impossible_labels, token_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids_1}
            return config, inputs_dict

@@ -306,6 +360,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)

    def test_xlnet_base_model_with_att_output(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        config_and_inputs[0].output_attentions = True
        self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs)

    def test_xlnet_lm_head(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()

@@ -316,12 +376,17 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)

    def test_xlnet_token_classif(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs)

    def test_xlnet_qa(self):
        self.model_tester.set_seed()
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)

-   @pytest.mark.slow
+   @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
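A hedged sketch of the call pattern the new XLNetForTokenClassification check relies on (logits and mems without labels, a scalar loss first when labels are passed); the config numbers are placeholders chosen only to keep the snippet small, and d_model/n_layer/n_head/d_inner are assumed to be the relevant size arguments.

import torch
from transformers import XLNetConfig, XLNetForTokenClassification

config = XLNetConfig(vocab_size_or_config_json_file=99, d_model=32,
                     n_layer=2, n_head=4, d_inner=37)
model = XLNetForTokenClassification(config)
model.eval()

input_ids = torch.randint(0, 99, (2, 7))                     # (batch_size, seq_length)
token_labels = torch.randint(0, config.num_labels, (2, 7))

logits, mems = model(input_ids)                              # no labels -> logits come first
loss, logits, mems = model(input_ids, labels=token_labels)   # with labels -> scalar loss comes first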
transformers/tests/optimization_test.py

@@ -18,19 +18,21 @@ from __future__ import print_function

import unittest
import os
import pytest

from transformers import is_torch_available

if is_torch_available():
    import torch
-   from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
-                             WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule,
-                             WarmupLinearSchedule)
else:
    pytestmark = pytest.mark.skip("Require Torch")

+from transformers import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup,
+                          get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup,
+                          get_linear_schedule_with_warmup)
from .tokenization_tests_commons import TemporaryDirectory
from .utils import require_torch


def unwrap_schedule(scheduler, num_steps=10):

@@ -54,6 +56,7 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
        scheduler.load_state_dict(state_dict)
    return lrs


+@require_torch
class OptimizationTest(unittest.TestCase):

    def assertListAlmostEqual(self, list1, list2, tol):

@@ -76,6 +79,7 @@ class OptimizationTest(unittest.TestCase):
        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)


+@require_torch
class ScheduleInitTest(unittest.TestCase):
    m = torch.nn.Linear(50, 50) if is_torch_available() else None
    optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None

@@ -87,59 +91,60 @@ class ScheduleInitTest(unittest.TestCase):
            self.assertAlmostEqual(a, b, delta=tol)

    def test_constant_scheduler(self):
-       scheduler = ConstantLRSchedule(self.optimizer)
+       scheduler = get_constant_schedule(self.optimizer)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [10.] * self.num_steps
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-       scheduler = ConstantLRSchedule(self.optimizer)
+       scheduler = get_constant_schedule(self.optimizer)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_constant_scheduler(self):
-       scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+       scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-       scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+       scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_linear_scheduler(self):
-       scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+       scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-       scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+       scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_cosine_scheduler(self):
-       scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+       scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-       scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+       scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

    def test_warmup_cosine_hard_restart_scheduler(self):
-       scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+       scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-       scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+       scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])


if __name__ == "__main__":
    unittest.main()
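The renamed helpers exercised above are the public schedule API. A short usage sketch wiring one of them into an ordinary training step; the model, learning rate, and step counts are placeholders.

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(50, 50)
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=10)

for _ in range(10):
    # ... compute the loss and call loss.backward() here ...
    optimizer.step()       # update parameters
    scheduler.step()       # then advance the learning-rate schedule
    optimizer.zero_grad()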
transformers/tests/optimization_tf_test.py (new file, mode 100644)

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest

from transformers import is_tf_available
from .utils import require_tf

if is_tf_available():
    import tensorflow as tf
    from tensorflow.python.eager import context
    from tensorflow.python.framework import ops
    from transformers import (create_optimizer, GradientAccumulator)


@require_tf
class OptimizationFTest(unittest.TestCase):
    def assertListAlmostEqual(self, list1, list2, tol):
        self.assertEqual(len(list1), len(list2))
        for a, b in zip(list1, list2):
            self.assertAlmostEqual(a, b, delta=tol)

    def testGradientAccumulator(self):
        accumulator = GradientAccumulator()
        accumulator([tf.constant([1.0, 2.0])])
        accumulator([tf.constant([-2.0, 1.0])])
        accumulator([tf.constant([-1.0, 2.0])])
        with self.assertRaises(ValueError):
            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
        self.assertEqual(accumulator.step, 3)
        self.assertEqual(len(accumulator.gradients), 1)
        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)

    def testGradientAccumulatorDistributionStrategy(self):
        context._context = None
        ops.enable_eager_execution_internal()
        physical_devices = tf.config.experimental.list_physical_devices("CPU")
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[0],
            [tf.config.experimental.VirtualDeviceConfiguration(),
             tf.config.experimental.VirtualDeviceConfiguration()])

        devices = tf.config.experimental.list_logical_devices(device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])

        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            optimizer = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                gradient_placeholder.values[0].assign(grad1)
                gradient_placeholder.values[1].assign(grad2)
                strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))

        @tf.function
        def apply_grad():
            with strategy.scope():
                strategy.experimental_run_v2(apply_on_replica)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
        apply_grad()
        self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
        self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)


if __name__ == "__main__":
    unittest.main()
\ No newline at end of file
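A minimal sketch of the single-replica GradientAccumulator pattern the new TF test checks: gradients are summed across calls, read back from .gradients, applied once, and cleared with reset(). The optimizer arguments simply mirror the positional call used in the test above; gradient values are placeholders.

import tensorflow as tf
from transformers import GradientAccumulator, create_optimizer

accumulator = GradientAccumulator()
optimizer = create_optimizer(5e-5, 10, 5)   # same positional call as in the test above
variable = tf.Variable([4.0, 3.0])

# Accumulate three "micro-batch" gradients for the single variable.
for grad in ([1.0, 2.0], [-2.0, 1.0], [-1.0, 2.0]):
    accumulator([tf.constant(grad)])

print(accumulator.step)                      # 3
print(accumulator.gradients[0].numpy())      # [-2.  5.]

# Apply the accumulated gradient once, then clear the buffers.
optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])))
accumulator.reset()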
transformers/tests/tokenization_albert_test.py (new file, mode 100644)
# coding=utf-8
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest

from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)

from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/spiece.model')


class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = AlbertTokenizer

    def setUp(self):
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',',
                                      u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',',
                                           '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.'])

    def test_sequence_builders(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id]


if __name__ == '__main__':
    unittest.main()
transformers/tests/tokenization_auto_test.py

@@ -18,14 +18,16 @@ from __future__ import print_function

import unittest
import shutil
import pytest
import logging

from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

from .utils import slow, SMALL_MODEL_IDENTIFIER


class AutoTokenizerTest(unittest.TestCase):
    @slow
    def test_tokenizer_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:

@@ -40,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
            self.assertGreater(len(tokenizer), 0)

    def test_tokenizer_from_pretrained_identifier(self):
        logging.basicConfig(level=logging.INFO)
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, BertTokenizer)
        self.assertEqual(len(tokenizer), 12)


if __name__ == "__main__":
    unittest.main()
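For reference, the dispatch the AutoTokenizer test covers, in plain usage; both checkpoint names are public model identifiers used here only as examples.

from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

assert isinstance(bert_tokenizer, BertTokenizer)   # the identifier picks the concrete class
assert isinstance(gpt2_tokenizer, GPT2Tokenizer)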
transformers/tests/tokenization_bert_japanese_test.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                     MecabTokenizer, CharacterTokenizer,
                                                     VOCAB_FILES_NAMES)

from .tokenization_tests_commons import CommonTestCases
from .utils import slow, custom_tokenizers


@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])

    def test_mecab_tokenizer(self):
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u" ", u"。"])

    def test_wordpiece_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
                              3, 4, 8, 4, 7, 11, 9, 10, 12])

    def test_character_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
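Reviewer note: the MeCab-based tests above are marked @custom_tokenizers because they need the optional MeCab dependency (mecab-python3 plus a dictionary). A rough sketch of the word-then-WordPiece pipeline these tests exercise, assuming MeCab is installed and the "bert-base-japanese" files used by the @slow test can be downloaded:

from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")

# MeCab splits the text into words first; each word is then WordPiece-tokenized.
tokens = tokenizer.tokenize(u"こんばんは、世界。")
ids = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
print(tokens, ids)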
transformers/tests/tokenization_bert_test.py
View file @
40ed7172
...
...
@@ -25,6 +25,7 @@ from transformers.tokenization_bert import (BasicTokenizer,
                                             _is_whitespace, VOCAB_FILES_NAMES)

from .tokenization_tests_commons import CommonTestCases
from .utils import slow


class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
...
@@ -125,11 +126,12 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
        self.assertFalse(_is_punctuation(u"A"))
        self.assertFalse(_is_punctuation(u" "))

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")
        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...
...
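Reviewer note: the hunk above replaces the plain encode() calls with encode(..., add_special_tokens=False), so build_inputs_with_special_tokens() is the only place [CLS]/[SEP] get attached. A small sketch of the relationship being tested, assuming the bert-base-uncased vocabulary can be fetched:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

ids = tokenizer.encode("sequence builders", add_special_tokens=False)  # plain wordpiece ids
sentence = tokenizer.build_inputs_with_special_tokens(ids)             # [CLS] + ids + [SEP]
assert sentence == [tokenizer.cls_token_id] + ids + [tokenizer.sep_token_id]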
transformers/tests/tokenization_distilbert_test.py
View file @
40ed7172
...
...
@@ -22,6 +22,7 @@ from transformers.tokenization_distilbert import (DistilBertTokenizer)
from .tokenization_tests_commons import CommonTestCases
from .tokenization_bert_test import BertTokenizationTest
from .utils import slow


class DistilBertTokenizationTest(BertTokenizationTest):
...
...
@@ -30,11 +31,12 @@ class DistilBertTokenizationTest(BertTokenizationTest):
    def get_tokenizer(self, **kwargs):
        return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    @slow
    def test_sequence_builders(self):
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")
        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
...
...
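Reviewer note: DistilBertTokenizationTest inherits from BertTokenizationTest, so the same add_special_tokens=False adjustment is mirrored here. The equivalence the tests lean on can be sketched as follows, assuming the distilbert-base-uncased files are reachable and that, as at this point in the library, encode() adds special tokens by default:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

plain = tokenizer.encode("multi-sequence build", add_special_tokens=False)
full = tokenizer.encode("multi-sequence build")  # special tokens added by default
assert full == tokenizer.build_inputs_with_special_tokens(plain)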