Unverified Commit 8406fa6d authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Add TFSpeech2Text (#15113)

* Add wrapper classes

* convert inner layers to tf

* Add TF Encoder and Decoder layers

* TFSpeech2Text models

* Loadable model

* TF model with same outputs as PT model

* test skeleton

* correct tests and run the fixup

* correct attention expansion

* TFSpeech2Text pask_key_values with TF format
parent 6a5472a8
......@@ -1478,6 +1478,8 @@ class ModelTesterMixin:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "input_features":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
......@@ -1529,6 +1531,8 @@ class ModelTesterMixin:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "input_features":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
......
......@@ -57,6 +57,7 @@ if is_tf_available():
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
BertConfig,
TFAutoModel,
......@@ -140,6 +141,7 @@ class TFModelTesterMixin:
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
*get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
*get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
*get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
]:
inputs_dict["labels"] = tf.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
......@@ -358,7 +360,6 @@ class TFModelTesterMixin:
pt_model = pt_model_class(config)
# Check we can load pt model in tf and vice-versa with model => model functions
tf_model = transformers.load_pytorch_model_in_tf2_model(
tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
)
......@@ -374,6 +375,8 @@ class TFModelTesterMixin:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "input_features":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
......@@ -416,6 +419,8 @@ class TFModelTesterMixin:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "input_features":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
......@@ -443,7 +448,24 @@ class TFModelTesterMixin:
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
for model_class in self.all_model_classes:
if self.is_encoder_decoder:
if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
inputs = {
"decoder_input_ids": tf.keras.Input(
batch_shape=(2, max_input),
name="decoder_input_ids",
dtype="int32",
),
"input_features": tf.keras.Input(
batch_shape=(
2,
max_input,
self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
),
name="input_features",
dtype="float32",
),
}
elif self.is_encoder_decoder:
inputs = {
"decoder_input_ids": tf.keras.Input(
batch_shape=(2, max_input),
......@@ -511,10 +533,7 @@ class TFModelTesterMixin:
outputs_dict = model(inputs)
inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
input_ids = inputs_keywords.pop("input_ids", None)
if input_ids is None:
input_ids = inputs_keywords.pop("pixel_values", None)
outputs_keywords = model(input_ids, **inputs_keywords)
outputs_keywords = model(**inputs_keywords)
output_dict = outputs_dict[0].numpy()
output_keywords = outputs_keywords[0].numpy()
......@@ -699,23 +718,28 @@ class TFModelTesterMixin:
def test_model_common_attributes(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
list_lm_models = (
text_in_text_out_models = (
get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
+ get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
+ get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
)
speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
for model_class in self.all_model_classes:
model = model_class(config)
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
if model_class in list_lm_models:
if model_class in text_in_text_out_models:
x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer)
name = model.get_bias()
assert isinstance(name, dict)
for k, v in name.items():
assert isinstance(v, tf.Variable)
elif model_class in speech_in_text_out_models:
x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer)
name = model.get_bias()
assert name is None
else:
x = model.get_output_embeddings()
assert x is None
......@@ -922,13 +946,13 @@ class TFModelTesterMixin:
model = model_class(config)
if config.bos_token_id is None:
# if bos token id is not defined mobel needs input_ids
# if bos token id is not defined model needs input_ids
with self.assertRaises(AssertionError):
model.generate(do_sample=True, max_length=5)
# num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True))
else:
# num_return_sequences = 1
elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
# Models with non-text inputs won't work here; num_return_sequences = 1
self._check_generated_ids(model.generate(do_sample=True, max_length=5))
with self.assertRaises(AssertionError):
......@@ -952,6 +976,8 @@ class TFModelTesterMixin:
def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models
for model_class in self.all_generative_model_classes:
......@@ -988,7 +1014,7 @@ class TFModelTesterMixin:
model = model_class(config)
if config.bos_token_id is None:
# if bos token id is not defined mobel needs input_ids, num_return_sequences = 1
# if bos token id is not defined model needs input_ids, num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
else:
# num_return_sequences = 1
......@@ -1023,6 +1049,8 @@ class TFModelTesterMixin:
def test_lm_head_model_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models
for model_class in self.all_generative_model_classes:
......@@ -1072,10 +1100,11 @@ class TFModelTesterMixin:
# Test that model correctly compute the loss with kwargs
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
input_name = "input_ids" if "input_ids" in prepared_for_class else "pixel_values"
input_ids = prepared_for_class.pop(input_name)
possible_input_names = {"input_ids", "pixel_values", "input_features"}
input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
model_input = prepared_for_class.pop(input_name)
loss = model(input_ids, **prepared_for_class)[0]
loss = model(model_input, **prepared_for_class)[0]
self.assertEqual(loss.shape, [loss_size])
# Test that model correctly compute the loss with a dict
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment