Unverified Commit 4dc65591 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Almost all TF models] TF clean up: add missing CLM / MLM loss; fix T5 naming...


[Almost all TF models] TF clean up: add missing CLM / MLM loss; fix T5 naming and keras compile (#5395)

* add first version of clm tf

* make style

* add more tests for bert

* update tf clm loss

* fix tests

* correct tf ner script

* add mlm loss

* delete bogus file

* clean tf auto model + add tests

* finish adding clm loss everywhere

* fix training in distilbert

* fix flake8

* save intermediate

* fix tf t5 naming

* remove prints

* finish up

* up

* fix tf gpt2

* fix new test utils import

* fix flake8

* keep backward compatibility

* Update src/transformers/modeling_tf_albert.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_electra.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_roberta.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_mobilebert.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_bert.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_distilbert.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply Sylvain's suggestions
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 33e43edd
...@@ -38,6 +38,9 @@ if is_tf_available(): ...@@ -38,6 +38,9 @@ if is_tf_available():
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
) )
if _tf_gpu_memory_limit is not None: if _tf_gpu_memory_limit is not None:
...@@ -93,6 +96,12 @@ class TFModelTesterMixin: ...@@ -93,6 +96,12 @@ class TFModelTesterMixin:
inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size) inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size)
elif model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(): elif model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values():
inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length))
elif model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values():
inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length))
elif model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.values():
inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length))
elif model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values():
inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length))
return inputs_dict return inputs_dict
def test_initialization(self): def test_initialization(self):
...@@ -291,7 +300,7 @@ class TFModelTesterMixin: ...@@ -291,7 +300,7 @@ class TFModelTesterMixin:
"decoder_input_ids": tf.keras.Input( "decoder_input_ids": tf.keras.Input(
batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"
), ),
"inputs": tf.keras.Input(batch_shape=(2, 2000), name="inputs", dtype="int32"), "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"),
} }
elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
input_ids = tf.keras.Input(batch_shape=(4, 2, 2000), name="input_ids", dtype="int32") input_ids = tf.keras.Input(batch_shape=(4, 2, 2000), name="input_ids", dtype="int32")
...@@ -325,7 +334,7 @@ class TFModelTesterMixin: ...@@ -325,7 +334,7 @@ class TFModelTesterMixin:
outputs_dict = model(self._prepare_for_class(inputs_dict, model_class)) outputs_dict = model(self._prepare_for_class(inputs_dict, model_class))
inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) input_ids = inputs_keywords.pop("input_ids", None)
outputs_keywords = model(input_ids, **inputs_keywords) outputs_keywords = model(input_ids, **inputs_keywords)
output_dict = outputs_dict[0].numpy() output_dict = outputs_dict[0].numpy()
output_keywords = outputs_keywords[0].numpy() output_keywords = outputs_keywords[0].numpy()
...@@ -479,9 +488,9 @@ class TFModelTesterMixin: ...@@ -479,9 +488,9 @@ class TFModelTesterMixin:
input_ids = inputs["input_ids"] input_ids = inputs["input_ids"]
del inputs["input_ids"] del inputs["input_ids"]
else: else:
encoder_input_ids = inputs["inputs"] encoder_input_ids = inputs["input_ids"]
decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
del inputs["inputs"] del inputs["input_ids"]
inputs.pop("decoder_input_ids", None) inputs.pop("decoder_input_ids", None)
wte = model.get_input_embeddings() wte = model.get_input_embeddings()
...@@ -596,9 +605,15 @@ class TFModelTesterMixin: ...@@ -596,9 +605,15 @@ class TFModelTesterMixin:
added_label = prepared_for_class[list(prepared_for_class.keys() - inputs_dict.keys())[0]] added_label = prepared_for_class[list(prepared_for_class.keys() - inputs_dict.keys())[0]]
loss_size = tf.size(added_label) loss_size = tf.size(added_label)
if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values():
# if loss is the causal LM loss, labels are shifted, so one label
# per batch is cut
loss_size = loss_size - self.model_tester.batch_size
# Test that model correctly compute the loss with kwargs # Test that model correctly compute the loss with kwargs
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
input_ids = prepared_for_class.pop("input_ids") input_ids = prepared_for_class.pop("input_ids")
loss = model(input_ids, **prepared_for_class)[0] loss = model(input_ids, **prepared_for_class)[0]
self.assertEqual(loss.shape, [loss_size]) self.assertEqual(loss.shape, [loss_size])
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import unittest import unittest
from transformers import DistilBertConfig, is_tf_available from transformers import DistilBertConfig, is_tf_available
from transformers.testing_utils import require_tf from transformers.testing_utils import require_tf, slow
from .test_configuration_common import ConfigTester from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
...@@ -32,6 +32,7 @@ if is_tf_available(): ...@@ -32,6 +32,7 @@ if is_tf_available():
TFDistilBertForSequenceClassification, TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification, TFDistilBertForTokenClassification,
TFDistilBertForMultipleChoice, TFDistilBertForMultipleChoice,
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
...@@ -118,9 +119,7 @@ class TFDistilBertModelTester: ...@@ -118,9 +119,7 @@ class TFDistilBertModelTester:
model = TFDistilBertForMaskedLM(config=config) model = TFDistilBertForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(prediction_scores,) = model(inputs) (prediction_scores,) = model(inputs)
result = { result = {"prediction_scores": prediction_scores.numpy()}
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
...@@ -129,12 +128,12 @@ class TFDistilBertModelTester: ...@@ -129,12 +128,12 @@ class TFDistilBertModelTester:
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFDistilBertForQuestionAnswering(config=config) model = TFDistilBertForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {
start_logits, end_logits = model(inputs) "input_ids": input_ids,
result = { "attention_mask": input_mask,
"start_logits": start_logits.numpy(),
"end_logits": end_logits.numpy(),
} }
start_logits, end_logits = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
...@@ -145,9 +144,7 @@ class TFDistilBertModelTester: ...@@ -145,9 +144,7 @@ class TFDistilBertModelTester:
model = TFDistilBertForSequenceClassification(config) model = TFDistilBertForSequenceClassification(config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(logits,) = model(inputs) (logits,) = model(inputs)
result = { result = {"logits": logits.numpy()}
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_distilbert_for_multiple_choice( def create_and_check_distilbert_for_multiple_choice(
...@@ -162,9 +159,7 @@ class TFDistilBertModelTester: ...@@ -162,9 +159,7 @@ class TFDistilBertModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
} }
(logits,) = model(inputs) (logits,) = model(inputs)
result = { result = {"logits": logits.numpy()}
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_distilbert_for_token_classification( def create_and_check_distilbert_for_token_classification(
...@@ -236,8 +231,8 @@ class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -236,8 +231,8 @@ class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
# @slow @slow
# def test_model_from_pretrained(self): def test_model_from_pretrained(self):
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: for model_name in list(TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]):
# model = DistilBertModesss.from_pretrained(model_name) model = TFDistilBertModel.from_pretrained(model_name)
# self.assertIsNotNone(model) self.assertIsNotNone(model)
...@@ -77,6 +77,7 @@ class TFT5ModelTester: ...@@ -77,6 +77,7 @@ class TFT5ModelTester:
eos_token_id=self.eos_token_id, eos_token_id=self.eos_token_id,
bos_token_id=self.pad_token_id, bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id, pad_token_id=self.pad_token_id,
decoder_start_token_id=self.pad_token_id,
) )
return (config, input_ids, input_mask, token_labels) return (config, input_ids, input_mask, token_labels)
...@@ -84,7 +85,7 @@ class TFT5ModelTester: ...@@ -84,7 +85,7 @@ class TFT5ModelTester:
def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
model = TFT5Model(config=config) model = TFT5Model(config=config)
inputs = { inputs = {
"inputs": input_ids, "input_ids": input_ids,
"decoder_input_ids": input_ids, "decoder_input_ids": input_ids,
"decoder_attention_mask": input_mask, "decoder_attention_mask": input_mask,
} }
...@@ -115,7 +116,7 @@ class TFT5ModelTester: ...@@ -115,7 +116,7 @@ class TFT5ModelTester:
def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
model = TFT5ForConditionalGeneration(config=config) model = TFT5ForConditionalGeneration(config=config)
inputs_dict = { inputs_dict = {
"inputs": input_ids, "input_ids": input_ids,
"decoder_input_ids": input_ids, "decoder_input_ids": input_ids,
"decoder_attention_mask": input_mask, "decoder_attention_mask": input_mask,
} }
...@@ -209,7 +210,7 @@ class TFT5ModelTester: ...@@ -209,7 +210,7 @@ class TFT5ModelTester:
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, token_labels) = config_and_inputs (config, input_ids, input_mask, token_labels) = config_and_inputs
inputs_dict = { inputs_dict = {
"inputs": input_ids, "input_ids": input_ids,
"decoder_input_ids": input_ids, "decoder_input_ids": input_ids,
"decoder_attention_mask": input_mask, "decoder_attention_mask": input_mask,
"use_cache": tf.convert_to_tensor([False]), "use_cache": tf.convert_to_tensor([False]),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment