Commit c7be096c authored by thomwolf

Merge branch 'master' into cli

parents 3492a6ec 33adab2b
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
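

# Tests for the PyTorch T5 implementation. A tiny, randomly initialized
# encoder-decoder model is built from T5Config and checked for output
# shapes and loss behaviour.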
@require_torch
class T5ModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    is_encoder_decoder = True

    class T5ModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     encoder_seq_length=7,
                     decoder_seq_length=9,
                     is_training=True,
                     use_attention_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     n_positions=14,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     d_ff=37,
                     relative_attention_num_buckets=8,
                     dropout_rate=0.1,
                     initializer_factor=0.002,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
            self.decoder_seq_length = decoder_seq_length
            self.is_training = is_training
            self.use_attention_mask = use_attention_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope
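
        # Build a tiny random T5Config together with matching encoder/decoder
        # input ids, attention masks and LM labels.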
        def prepare_config_and_inputs(self):
            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            encoder_attention_mask = None
            decoder_attention_mask = None
            if self.use_attention_mask:
                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

            decoder_lm_labels = None
            if self.use_labels:
                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
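
        # The LM loss returned by T5WithLMHeadModel is a scalar, i.e. a
        # zero-dimensional tensor whose size list is empty.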
        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])
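
        # Run the bare T5Model both with and without attention masks and check
        # that encoder and decoder outputs have shape (batch, seq_len, d_model).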
        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            model = T5Model(config=config)
            model.eval()
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids,
                                                   encoder_attention_mask=encoder_attention_mask,
                                                   decoder_attention_mask=decoder_attention_mask)
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids)

            result = {
                "encoder_output": encoder_output,
                "decoder_output": decoder_output,
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].size()),
                [self.batch_size, self.encoder_seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].size()),
                [self.batch_size, self.decoder_seq_length, self.hidden_size])
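
        # Run the LM-head variant with labels and validate both the scalar loss
        # and the (batch, seq_len, vocab_size) prediction scores.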
        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            model = T5WithLMHeadModel(config=config)
            model.eval()
            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
            loss, prediction_scores = outputs[0], outputs[1]
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.decoder_seq_length, self.vocab_size])
            self.check_loss_output(result)
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': encoder_input_ids,
                           'decoder_input_ids': decoder_input_ids,
                           'decoder_attention_mask': decoder_attention_mask,
                           'encoder_attention_mask': encoder_attention_mask}
            return config, inputs_dict
    def setUp(self):
        self.model_tester = T5ModelTest.T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
......
@@ -22,7 +22,7 @@ import logging
 from transformers import is_tf_available

-from .utils import require_tf, slow
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER

 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
@@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForQuestionAnswering)

+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
+        self.assertIsInstance(model, TFBertForMaskedLM)
+

 if __name__ == "__main__":
     unittest.main()
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
......
@@ -69,6 +69,7 @@ class TFCommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        is_encoder_decoder = False

         def test_initialization(self):
             pass
@@ -129,8 +130,12 @@ class TFCommonTestCases:
                                    for name, key in inputs_dict.items())
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
-            tfo = tf_model(inputs_dict)
-            max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+            tfo = tf_model(inputs_dict, training=False)
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
+            tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+            pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
             self.assertLessEqual(max_diff, 2e-2)

             # Check we can load pt model in tf and vice-versa with checkpoint => model functions
@@ -150,13 +155,21 @@ class TFCommonTestCases:
             with torch.no_grad():
                 pto = pt_model(**pt_inputs_dict)
             tfo = tf_model(inputs_dict)
-            max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tfo[np.isnan(tfo)] = 0
+            pto[np.isnan(pto)] = 0
+            max_diff = np.amax(np.abs(tfo - pto))
             self.assertLessEqual(max_diff, 2e-2)

         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
             metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
@@ -189,7 +202,7 @@ class TFCommonTestCases:
             outputs_dict = model(inputs_dict)

             inputs_keywords = copy.deepcopy(inputs_dict)
-            input_ids = inputs_keywords.pop('input_ids')
+            input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
             outputs_keywords = model(input_ids, **inputs_keywords)

             output_dict = outputs_dict[0].numpy()
@@ -200,6 +213,11 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
@@ -212,16 +230,28 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])
                 out_len = len(outputs)

+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2) - 1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 outputs = model(inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
@@ -230,8 +260,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])

         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -264,35 +294,53 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
+
+        def _get_embeds(self, wte, input_ids):
+            # ^^ In our TF models, the input_embeddings can take slightly different forms,
+            # so we try a few of them.
+            # We used to fall back to just synthetically creating a dummy tensor of ones:
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x

         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]

             for model_class in self.all_model_classes:
                 model = model_class(config)

                 wte = model.get_input_embeddings()
-                try:
-                    x = wte(input_ids, mode="embedding")
-                except:
-                    try:
-                        x = wte([input_ids], mode="embedding")
-                    except:
-                        try:
-                            x = wte([input_ids, None, None, None], mode="embedding")
-                        except:
-                            if hasattr(self.model_tester, "embedding_size"):
-                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                            else:
-                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try a few of them.
-                # We used to fall back to just synthetically creating a dummy tensor of ones:
-                #
-                inputs_dict["inputs_embeds"] = x
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)

                 outputs = model(inputs_dict)
......
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
......
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
......
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
......
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
......
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
             choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
......
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow
from transformers import T5Config, is_tf_available
if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
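

# Tests for the TF 2.0 T5 implementation. They mirror the PyTorch suite above,
# but use a single shared sequence length for both encoder and decoder.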
@require_tf
class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):

    is_encoder_decoder = True
    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()

    class TFT5ModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     n_positions=14,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     d_ff=37,
                     relative_attention_num_buckets=8,
                     dropout_rate=0.1,
                     initializer_factor=0.002,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_labels = None
            if self.use_labels:
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, input_ids, input_mask, token_labels)
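
        # Call the TF model both with a single dict of inputs and with keyword
        # arguments, and check both output shapes.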
        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
            model = TFT5Model(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            encoder_output, decoder_output = model(inputs)

            encoder_output, decoder_output = model(input_ids,
                                                   decoder_attention_mask=input_mask,
                                                   encoder_input_ids=input_ids)

            result = {
                "encoder_output": encoder_output.numpy(),
                "decoder_output": decoder_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
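
        # The LM-head variant returns per-token prediction scores over the
        # vocabulary as its first output.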
        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
            model = TFT5WithLMHeadModel(config=config)
            inputs = {'encoder_input_ids': input_ids,
                      'decoder_input_ids': input_ids,
                      'decoder_attention_mask': input_mask}
            prediction_scores, decoder_output = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, token_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': input_ids,
                           'decoder_input_ids': input_ids,
                           'decoder_attention_mask': input_mask}
            return config, inputs_dict
    def setUp(self):
        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in ['t5-small']:
            model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
......
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
             is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

             config = XLMConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
......
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
......
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
......
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
             is_impossible_labels = ids_tensor([self.batch_size], 2).float()

             config = XLMConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
......
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
......
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER


 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
         self.assertIsInstance(tokenizer, GPT2Tokenizer)
         self.assertGreater(len(tokenizer), 0)

+    def test_tokenizer_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(tokenizer, BertTokenizer)
+        self.assertEqual(len(tokenizer), 12)
+

 if __name__ == "__main__":
     unittest.main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest
from io import open
from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                     MecabTokenizer, CharacterTokenizer,
                                                     VOCAB_FILES_NAMES)
from .tokenization_tests_commons import CommonTestCases
from .utils import slow, custom_tokenizers
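

# Note: MecabTokenizer needs the MeCab morphological analyzer and its Python
# bindings installed, which is why this suite is gated by @custom_tokenizers.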
@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
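
    # MeCab performs morphological word segmentation; by default the tokenizer
    # also normalizes the text and drops stray whitespace, unless
    # normalize_text=False is passed.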
    def test_mecab_tokenizer(self):
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u" ", u"。"])

    def test_wordpiece_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]


class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
                              3, 4, 8, 4, 7, 11, 9, 10, 12])
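
    # CharacterTokenizer splits strings into single characters and maps
    # characters missing from the vocab to the unknown token.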
    def test_character_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]

 if __name__ == '__main__':
     unittest.main()