"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "f83942684ddb5c691146df21c78615b09a241267"
Unverified Commit 8406fa6d authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Add TFSpeech2Text (#15113)

* Add wrapper classes

* convert inner layers to tf

* Add TF Encoder and Decoder layers

* TFSpeech2Text models

* Loadable model

* TF model with same outputs as PT model

* test skeleton

* correct tests and run the fixup

* correct attention expansion

* TFSpeech2Text past_key_values with TF format
parent 6a5472a8
...@@ -1478,6 +1478,8 @@ class ModelTesterMixin: ...@@ -1478,6 +1478,8 @@ class ModelTesterMixin:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values": elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "input_features":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else: else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
...@@ -1529,6 +1531,8 @@ class ModelTesterMixin: ...@@ -1529,6 +1531,8 @@ class ModelTesterMixin:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "pixel_values": elif key == "pixel_values":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
elif key == "input_features":
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32)
else: else:
tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32)
......
...@@ -57,6 +57,7 @@ if is_tf_available(): ...@@ -57,6 +57,7 @@ if is_tf_available():
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
BertConfig, BertConfig,
TFAutoModel, TFAutoModel,
...@@ -140,6 +141,7 @@ class TFModelTesterMixin: ...@@ -140,6 +141,7 @@ class TFModelTesterMixin:
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
*get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
*get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
*get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
]: ]:
inputs_dict["labels"] = tf.zeros( inputs_dict["labels"] = tf.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
...@@ -358,7 +360,6 @@ class TFModelTesterMixin: ...@@ -358,7 +360,6 @@ class TFModelTesterMixin:
pt_model = pt_model_class(config) pt_model = pt_model_class(config)
# Check we can load pt model in tf and vice-versa with model => model functions # Check we can load pt model in tf and vice-versa with model => model functions
tf_model = transformers.load_pytorch_model_in_tf2_model( tf_model = transformers.load_pytorch_model_in_tf2_model(
tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
) )
...@@ -374,6 +375,8 @@ class TFModelTesterMixin: ...@@ -374,6 +375,8 @@ class TFModelTesterMixin:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values": elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "input_features":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else: else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
...@@ -416,6 +419,8 @@ class TFModelTesterMixin: ...@@ -416,6 +419,8 @@ class TFModelTesterMixin:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "pixel_values": elif name == "pixel_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
elif name == "input_features":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
else: else:
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
...@@ -443,7 +448,24 @@ class TFModelTesterMixin: ...@@ -443,7 +448,24 @@ class TFModelTesterMixin:
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
if self.is_encoder_decoder: if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
inputs = {
"decoder_input_ids": tf.keras.Input(
batch_shape=(2, max_input),
name="decoder_input_ids",
dtype="int32",
),
"input_features": tf.keras.Input(
batch_shape=(
2,
max_input,
self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
),
name="input_features",
dtype="float32",
),
}
elif self.is_encoder_decoder:
inputs = { inputs = {
"decoder_input_ids": tf.keras.Input( "decoder_input_ids": tf.keras.Input(
batch_shape=(2, max_input), batch_shape=(2, max_input),
...@@ -511,10 +533,7 @@ class TFModelTesterMixin: ...@@ -511,10 +533,7 @@ class TFModelTesterMixin:
outputs_dict = model(inputs) outputs_dict = model(inputs)
inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
input_ids = inputs_keywords.pop("input_ids", None) outputs_keywords = model(**inputs_keywords)
if input_ids is None:
input_ids = inputs_keywords.pop("pixel_values", None)
outputs_keywords = model(input_ids, **inputs_keywords)
output_dict = outputs_dict[0].numpy() output_dict = outputs_dict[0].numpy()
output_keywords = outputs_keywords[0].numpy() output_keywords = outputs_keywords[0].numpy()
...@@ -699,23 +718,28 @@ class TFModelTesterMixin: ...@@ -699,23 +718,28 @@ class TFModelTesterMixin:
def test_model_common_attributes(self): def test_model_common_attributes(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
list_lm_models = ( text_in_text_out_models = (
get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
+ get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
+ get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
) )
speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
if model_class in text_in_text_out_models:
if model_class in list_lm_models:
x = model.get_output_embeddings() x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer) assert isinstance(x, tf.keras.layers.Layer)
name = model.get_bias() name = model.get_bias()
assert isinstance(name, dict) assert isinstance(name, dict)
for k, v in name.items(): for k, v in name.items():
assert isinstance(v, tf.Variable) assert isinstance(v, tf.Variable)
elif model_class in speech_in_text_out_models:
x = model.get_output_embeddings()
assert isinstance(x, tf.keras.layers.Layer)
name = model.get_bias()
assert name is None
else: else:
x = model.get_output_embeddings() x = model.get_output_embeddings()
assert x is None assert x is None
...@@ -922,13 +946,13 @@ class TFModelTesterMixin: ...@@ -922,13 +946,13 @@ class TFModelTesterMixin:
model = model_class(config) model = model_class(config)
if config.bos_token_id is None: if config.bos_token_id is None:
# if bos token id is not defined mobel needs input_ids # if bos token id is not defined model needs input_ids
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
model.generate(do_sample=True, max_length=5) model.generate(do_sample=True, max_length=5)
# num_return_sequences = 1 # num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True)) self._check_generated_ids(model.generate(input_ids, do_sample=True))
else: elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
# num_return_sequences = 1 # Models with non-text inputs won't work here; num_return_sequences = 1
self._check_generated_ids(model.generate(do_sample=True, max_length=5)) self._check_generated_ids(model.generate(do_sample=True, max_length=5))
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
...@@ -952,6 +976,8 @@ class TFModelTesterMixin: ...@@ -952,6 +976,8 @@ class TFModelTesterMixin:
def test_lm_head_model_no_beam_search_generate_dict_outputs(self): def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None) input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models # iterate over all generative models
for model_class in self.all_generative_model_classes: for model_class in self.all_generative_model_classes:
...@@ -988,7 +1014,7 @@ class TFModelTesterMixin: ...@@ -988,7 +1014,7 @@ class TFModelTesterMixin:
model = model_class(config) model = model_class(config)
if config.bos_token_id is None: if config.bos_token_id is None:
# if bos token id is not defined mobel needs input_ids, num_return_sequences = 1 # if bos token id is not defined model needs input_ids, num_return_sequences = 1
self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
else: else:
# num_return_sequences = 1 # num_return_sequences = 1
...@@ -1023,6 +1049,8 @@ class TFModelTesterMixin: ...@@ -1023,6 +1049,8 @@ class TFModelTesterMixin:
def test_lm_head_model_beam_search_generate_dict_outputs(self): def test_lm_head_model_beam_search_generate_dict_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict.get("input_ids", None) input_ids = inputs_dict.get("input_ids", None)
if input_ids is None:
input_ids = inputs_dict.get("input_features", None)
# iterate over all generative models # iterate over all generative models
for model_class in self.all_generative_model_classes: for model_class in self.all_generative_model_classes:
...@@ -1072,10 +1100,11 @@ class TFModelTesterMixin: ...@@ -1072,10 +1100,11 @@ class TFModelTesterMixin:
# Test that model correctly compute the loss with kwargs # Test that model correctly compute the loss with kwargs
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
input_name = "input_ids" if "input_ids" in prepared_for_class else "pixel_values" possible_input_names = {"input_ids", "pixel_values", "input_features"}
input_ids = prepared_for_class.pop(input_name) input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
model_input = prepared_for_class.pop(input_name)
loss = model(input_ids, **prepared_for_class)[0] loss = model(model_input, **prepared_for_class)[0]
self.assertEqual(loss.shape, [loss_size]) self.assertEqual(loss.shape, [loss_size])
# Test that model correctly compute the loss with a dict # Test that model correctly compute the loss with a dict
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment