Unverified Commit 8406fa6d authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Add TFSpeech2Text (#15113)

* Add wrapper classes

* convert inner layers to tf

* Add TF Encoder and Decoder layers

* TFSpeech2Text models

* Loadable model

* TF model with same outputs as PT model

* test skeleton

* correct tests and run the fixup

* correct attention expansion

* TFSpeech2Text past_key_values with TF format
parent 6a5472a8
...@@ -227,7 +227,7 @@ Flax), PyTorch, and/or TensorFlow. ...@@ -227,7 +227,7 @@ Flax), PyTorch, and/or TensorFlow.
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | | SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ |
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ | | Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ | | Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ |
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | | Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | | Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | | SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
......
...@@ -202,6 +202,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its ...@@ -202,6 +202,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] TFAutoModelForVision2Seq [[autodoc]] TFAutoModelForVision2Seq
## TFAutoModelForSpeechSeq2Seq
[[autodoc]] TFAutoModelForSpeechSeq2Seq
## FlaxAutoModel ## FlaxAutoModel
[[autodoc]] FlaxAutoModel [[autodoc]] FlaxAutoModel
......
...@@ -144,3 +144,13 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look ...@@ -144,3 +144,13 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look
[[autodoc]] Speech2TextForConditionalGeneration [[autodoc]] Speech2TextForConditionalGeneration
- forward - forward
## TFSpeech2TextModel
[[autodoc]] TFSpeech2TextModel
- call
## TFSpeech2TextForConditionalGeneration
[[autodoc]] TFSpeech2TextForConditionalGeneration
- call
...@@ -1621,6 +1621,7 @@ if is_tf_available(): ...@@ -1621,6 +1621,7 @@ if is_tf_available():
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_VISION_2_SEQ_MAPPING", "TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
...@@ -1635,6 +1636,7 @@ if is_tf_available(): ...@@ -1635,6 +1636,7 @@ if is_tf_available():
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification", "TFAutoModelForSequenceClassification",
"TFAutoModelForSpeechSeq2Seq",
"TFAutoModelForTableQuestionAnswering", "TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTokenClassification", "TFAutoModelForTokenClassification",
"TFAutoModelForVision2Seq", "TFAutoModelForVision2Seq",
...@@ -1946,6 +1948,14 @@ if is_tf_available(): ...@@ -1946,6 +1948,14 @@ if is_tf_available():
"TFRoFormerPreTrainedModel", "TFRoFormerPreTrainedModel",
] ]
) )
_import_structure["models.speech_to_text"].extend(
[
"TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSpeech2TextForConditionalGeneration",
"TFSpeech2TextModel",
"TFSpeech2TextPreTrainedModel",
]
)
_import_structure["models.t5"].extend( _import_structure["models.t5"].extend(
[ [
"TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST", "TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST",
...@@ -3588,6 +3598,7 @@ if TYPE_CHECKING: ...@@ -3588,6 +3598,7 @@ if TYPE_CHECKING:
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
...@@ -3602,6 +3613,7 @@ if TYPE_CHECKING: ...@@ -3602,6 +3613,7 @@ if TYPE_CHECKING:
TFAutoModelForQuestionAnswering, TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM, TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification, TFAutoModelForSequenceClassification,
TFAutoModelForSpeechSeq2Seq,
TFAutoModelForTableQuestionAnswering, TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification, TFAutoModelForTokenClassification,
TFAutoModelForVision2Seq, TFAutoModelForVision2Seq,
...@@ -3850,6 +3862,12 @@ if TYPE_CHECKING: ...@@ -3850,6 +3862,12 @@ if TYPE_CHECKING:
TFRoFormerModel, TFRoFormerModel,
TFRoFormerPreTrainedModel, TFRoFormerPreTrainedModel,
) )
from .models.speech_to_text import (
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSpeech2TextForConditionalGeneration,
TFSpeech2TextModel,
TFSpeech2TextPreTrainedModel,
)
from .models.t5 import ( from .models.t5 import (
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST,
TFT5EncoderModel, TFT5EncoderModel,
......
...@@ -394,9 +394,12 @@ class TFGenerationMixin: ...@@ -394,9 +394,12 @@ class TFGenerationMixin:
Parameters: Parameters:
input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, `(batch_size, sequence_length,
The sequence used as a prompt for the generation. If `None` the method initializes it with feature_dim)` or `(batch_size, num_channels, height, width)`, *optional*):
`bos_token_id` and a batch size of 1. The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
`input_ids`, `input_values`, `input_features`, or `pixel_values`.
max_length (`int`, *optional*, defaults to 20): max_length (`int`, *optional*, defaults to 20):
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
min_length (`int`, *optional*, defaults to 10): min_length (`int`, *optional*, defaults to 10):
...@@ -657,11 +660,12 @@ class TFGenerationMixin: ...@@ -657,11 +660,12 @@ class TFGenerationMixin:
), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
# create attention mask if necessary # create attention mask if necessary
# TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 accepts_attention_mask = "attention_mask" in set(inspect.signature(self.call).parameters.keys())
if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): if accepts_attention_mask:
attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
elif attention_mask is None: attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
attention_mask = tf.ones_like(input_ids) elif attention_mask is None:
attention_mask = tf.ones(shape_list(input_ids)[:2], dtype=tf.int32)
if pad_token_id is None and eos_token_id is not None: if pad_token_id is None and eos_token_id is not None:
logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence")
...@@ -697,16 +701,12 @@ class TFGenerationMixin: ...@@ -697,16 +701,12 @@ class TFGenerationMixin:
encoder = self.get_encoder() encoder = self.get_encoder()
encoder_kwargs = { encoder_kwargs = {
"attention_mask": attention_mask,
"output_attentions": output_attentions, "output_attentions": output_attentions,
"output_hidden_states": output_hidden_states, "output_hidden_states": output_hidden_states,
"return_dict": return_dict_in_generate, "return_dict": return_dict_in_generate,
} }
if accepts_attention_mask:
# vision models don't use `attention_mask`. encoder_kwargs["attention_mask"] = attention_mask
signature = dict(inspect.signature(encoder.call).parameters)
if "attention_mask" not in signature:
encoder_kwargs.pop("attention_mask")
encoder_outputs = encoder(input_ids, **encoder_kwargs) encoder_outputs = encoder(input_ids, **encoder_kwargs)
if return_dict_in_generate: if return_dict_in_generate:
...@@ -715,23 +715,15 @@ class TFGenerationMixin: ...@@ -715,23 +715,15 @@ class TFGenerationMixin:
if output_hidden_states: if output_hidden_states:
model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states model_kwargs["encoder_hidden_states"] = encoder_outputs.hidden_states
# The condition `len(shape_list(input_ids)) == 2` is to make this block treats only text inputs. expanded_batch_idxs = tf.reshape(
# (vision inputs might occur when the model is an encoder-decoder model) tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1),
# Expand input ids if num_beams > 1 or num_return_sequences > 1 shape=(-1,),
if len(shape_list(input_ids)) == 2 and (num_return_sequences > 1 or num_beams > 1): )
input_ids_len = shape_list(input_ids)[-1] # prepares text-based inputs
input_ids = tf.broadcast_to( if len(shape_list(input_ids)) == 2:
tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) input_ids = tf.gather(input_ids, expanded_batch_idxs, axis=0)
) if accepts_attention_mask:
attention_mask = tf.broadcast_to( attention_mask = tf.gather(attention_mask, expanded_batch_idxs, axis=0)
tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
)
input_ids = tf.reshape(
input_ids, (effective_batch_size * num_beams, input_ids_len)
) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
attention_mask = tf.reshape(
attention_mask, (effective_batch_size * num_beams, input_ids_len)
) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
if self.config.is_encoder_decoder: if self.config.is_encoder_decoder:
...@@ -749,11 +741,6 @@ class TFGenerationMixin: ...@@ -749,11 +741,6 @@ class TFGenerationMixin:
batch_size == encoder_outputs[0].shape[0] batch_size == encoder_outputs[0].shape[0]
), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "
# expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1)
expanded_batch_idxs = tf.reshape(
tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1),
shape=(-1,),
)
# expand encoder_outputs # expand encoder_outputs
encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),)
else: else:
...@@ -851,7 +838,8 @@ class TFGenerationMixin: ...@@ -851,7 +838,8 @@ class TFGenerationMixin:
unfinished_sents = tf.ones_like(input_ids[:, 0]) unfinished_sents = tf.ones_like(input_ids[:, 0])
sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length
past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models # defined for encoder-decoder models, None for decoder-only models
past = encoder_outputs
# init attention / hidden states / scores tuples # init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None scores = () if (return_dict_in_generate and kwargs["output_scores"]) else None
...@@ -871,7 +859,11 @@ class TFGenerationMixin: ...@@ -871,7 +859,11 @@ class TFGenerationMixin:
while cur_len < max_length: while cur_len < max_length:
model_inputs = self.prepare_inputs_for_generation( model_inputs = self.prepare_inputs_for_generation(
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs input_ids,
past=past,
attention_mask=attention_mask,
use_cache=use_cache,
**kwargs,
) )
outputs = self( outputs = self(
**model_inputs, **model_inputs,
...@@ -1132,7 +1124,11 @@ class TFGenerationMixin: ...@@ -1132,7 +1124,11 @@ class TFGenerationMixin:
while cur_len < max_length: while cur_len < max_length:
model_inputs = self.prepare_inputs_for_generation( model_inputs = self.prepare_inputs_for_generation(
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs input_ids,
past=past,
attention_mask=attention_mask,
use_cache=use_cache,
**kwargs,
) )
outputs = self( outputs = self(
**model_inputs, **model_inputs,
......
...@@ -35,6 +35,7 @@ class TransposeType(ExplicitEnum): ...@@ -35,6 +35,7 @@ class TransposeType(ExplicitEnum):
NO = "no" NO = "no"
SIMPLE = "simple" SIMPLE = "simple"
CONV1D = "conv1d"
CONV2D = "conv2d" CONV2D = "conv2d"
...@@ -68,8 +69,9 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="", ...@@ -68,8 +69,9 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="",
# When should we transpose the weights # When should we transpose the weights
if tf_name[-1] == "kernel" and tf_weight_shape is not None and tf_weight_shape.rank == 4: if tf_name[-1] == "kernel" and tf_weight_shape is not None and tf_weight_shape.rank == 4:
# A simple heuristic to detect conv layer using weight array shape
transpose = TransposeType.CONV2D transpose = TransposeType.CONV2D
elif tf_name[-1] == "kernel" and tf_weight_shape is not None and tf_weight_shape.rank == 3:
transpose = TransposeType.CONV1D
elif bool( elif bool(
tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"] tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"]
or "emb_projs" in tf_name or "emb_projs" in tf_name
...@@ -194,7 +196,6 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ...@@ -194,7 +196,6 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
# authorized missing keys don't have to be loaded # authorized missing keys don't have to be loaded
if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing): if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
continue continue
raise AttributeError(f"{name} not found in PyTorch model") raise AttributeError(f"{name} not found in PyTorch model")
array = pt_state_dict[name].numpy() array = pt_state_dict[name].numpy()
...@@ -204,6 +205,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ...@@ -204,6 +205,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
# PT: (num_out_channel, num_in_channel, kernel[0], kernel[1]) # PT: (num_out_channel, num_in_channel, kernel[0], kernel[1])
# -> TF: (kernel[0], kernel[1], num_in_channel, num_out_channel) # -> TF: (kernel[0], kernel[1], num_in_channel, num_out_channel)
array = numpy.transpose(array, axes=(2, 3, 1, 0)) array = numpy.transpose(array, axes=(2, 3, 1, 0))
elif transpose is TransposeType.CONV1D:
# Conv1D weight:
# PT: (num_out_channel, num_in_channel, kernel)
# -> TF: (kernel, num_in_channel, num_out_channel)
array = numpy.transpose(array, axes=(2, 1, 0))
elif transpose is TransposeType.SIMPLE: elif transpose is TransposeType.SIMPLE:
array = numpy.transpose(array) array = numpy.transpose(array)
...@@ -355,7 +361,6 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F ...@@ -355,7 +361,6 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
all_tf_weights = set(list(tf_weights_map.keys())) all_tf_weights = set(list(tf_weights_map.keys()))
loaded_pt_weights_data_ptr = {} loaded_pt_weights_data_ptr = {}
missing_keys_pt = [] missing_keys_pt = []
for pt_weight_name, pt_weight in current_pt_params_dict.items(): for pt_weight_name, pt_weight in current_pt_params_dict.items():
# Handle PyTorch shared weight (not duplicated in TF 2.0) # Handle PyTorch shared weight (not duplicated in TF 2.0)
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
...@@ -377,6 +382,11 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F ...@@ -377,6 +382,11 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
# TF: (kernel[0], kernel[1], num_in_channel, num_out_channel) # TF: (kernel[0], kernel[1], num_in_channel, num_out_channel)
# -> PT: (num_out_channel, num_in_channel, kernel[0], kernel[1]) # -> PT: (num_out_channel, num_in_channel, kernel[0], kernel[1])
array = numpy.transpose(array, axes=(3, 2, 0, 1)) array = numpy.transpose(array, axes=(3, 2, 0, 1))
elif transpose is TransposeType.CONV1D:
# Conv1D weight:
# TF: (kernel, num_in_channel, num_out_channel)
# -> PT: (num_out_channel, num_in_channel, kernel)
array = numpy.transpose(array, axes=(2, 1, 0))
elif transpose is TransposeType.SIMPLE: elif transpose is TransposeType.SIMPLE:
array = numpy.transpose(array) array = numpy.transpose(array)
......
...@@ -87,6 +87,7 @@ if is_tf_available(): ...@@ -87,6 +87,7 @@ if is_tf_available():
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_VISION_2_SEQ_MAPPING", "TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
...@@ -101,6 +102,7 @@ if is_tf_available(): ...@@ -101,6 +102,7 @@ if is_tf_available():
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification", "TFAutoModelForSequenceClassification",
"TFAutoModelForSpeechSeq2Seq",
"TFAutoModelForTableQuestionAnswering", "TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTokenClassification", "TFAutoModelForTokenClassification",
"TFAutoModelForVision2Seq", "TFAutoModelForVision2Seq",
...@@ -201,6 +203,7 @@ if TYPE_CHECKING: ...@@ -201,6 +203,7 @@ if TYPE_CHECKING:
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_VISION_2_SEQ_MAPPING, TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
...@@ -215,6 +218,7 @@ if TYPE_CHECKING: ...@@ -215,6 +218,7 @@ if TYPE_CHECKING:
TFAutoModelForQuestionAnswering, TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM, TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification, TFAutoModelForSequenceClassification,
TFAutoModelForSpeechSeq2Seq,
TFAutoModelForTableQuestionAnswering, TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification, TFAutoModelForTokenClassification,
TFAutoModelForVision2Seq, TFAutoModelForVision2Seq,
......
...@@ -801,7 +801,7 @@ class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass): ...@@ -801,7 +801,7 @@ class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
AutoModelForSpeechSeq2Seq = auto_class_update( AutoModelForSpeechSeq2Seq = auto_class_update(
AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeing" AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
) )
......
...@@ -29,6 +29,7 @@ logger = logging.get_logger(__name__) ...@@ -29,6 +29,7 @@ logger = logging.get_logger(__name__)
TF_MODEL_MAPPING_NAMES = OrderedDict( TF_MODEL_MAPPING_NAMES = OrderedDict(
[ [
# Base model mapping # Base model mapping
("speech_to_text", "TFSpeech2TextModel"),
("clip", "TFCLIPModel"), ("clip", "TFCLIPModel"),
("deberta-v2", "TFDebertaV2Model"), ("deberta-v2", "TFDebertaV2Model"),
("deberta", "TFDebertaModel"), ("deberta", "TFDebertaModel"),
...@@ -103,6 +104,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( ...@@ -103,6 +104,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
[ [
# Model with LM heads mapping # Model with LM heads mapping
("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
("rembert", "TFRemBertForMaskedLM"), ("rembert", "TFRemBertForMaskedLM"),
("roformer", "TFRoFormerForMaskedLM"), ("roformer", "TFRoFormerForMaskedLM"),
("convbert", "TFConvBertForMaskedLM"), ("convbert", "TFConvBertForMaskedLM"),
...@@ -204,6 +206,12 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ...@@ -204,6 +206,12 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
] ]
) )
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
[
("speech_to_text", "TFSpeech2TextForConditionalGeneration"),
]
)
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[ [
# Model for Sequence Classification mapping # Model for Sequence Classification mapping
...@@ -340,6 +348,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( ...@@ -340,6 +348,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping(
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
) )
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
)
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
) )
...@@ -468,6 +479,15 @@ TFAutoModelForNextSentencePrediction = auto_class_update( ...@@ -468,6 +479,15 @@ TFAutoModelForNextSentencePrediction = auto_class_update(
) )
class TFAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
TFAutoModelForSpeechSeq2Seq = auto_class_update(
TFAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
)
class TFAutoModelWithLMHead(_TFAutoModelWithLMHead): class TFAutoModelWithLMHead(_TFAutoModelWithLMHead):
@classmethod @classmethod
def from_config(cls, config): def from_config(cls, config):
......
...@@ -147,7 +147,11 @@ class TFBartAttention(tf.keras.layers.Layer): ...@@ -147,7 +147,11 @@ class TFBartAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
...@@ -296,11 +300,11 @@ class TFBartEncoderLayer(tf.keras.layers.Layer): ...@@ -296,11 +300,11 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(encoder_attention_heads,)* `(encoder_attention_heads,)`
""" """
residual = hidden_states residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn( hidden_states, self_attn_weights, _ = self.self_attn(
...@@ -372,17 +376,17 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): ...@@ -372,17 +376,17 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
encoder_hidden_states (`tf.Tensor`): encoder_hidden_states (`tf.Tensor`):
cross attention input to the layer of shape *(seq_len, batch, embed_dim)* cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_attention_mask (`tf.Tensor`): encoder attention mask of size encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
""" """
residual = hidden_states residual = hidden_states
......
...@@ -150,7 +150,11 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -150,7 +150,11 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
......
...@@ -149,7 +149,11 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -149,7 +149,11 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
...@@ -299,11 +303,11 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): ...@@ -299,11 +303,11 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(encoder_attention_heads,)* `(encoder_attention_heads,)`
""" """
residual = hidden_states residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn( hidden_states, self_attn_weights, _ = self.self_attn(
...@@ -376,17 +380,17 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ...@@ -376,17 +380,17 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
encoder_hidden_states (`tf.Tensor`): encoder_hidden_states (`tf.Tensor`):
cross attention input to the layer of shape *(seq_len, batch, embed_dim)* cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_attention_mask (`tf.Tensor`): encoder attention mask of size encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
""" """
residual = hidden_states residual = hidden_states
......
...@@ -736,7 +736,11 @@ class TFHubertAttention(tf.keras.layers.Layer): ...@@ -736,7 +736,11 @@ class TFHubertAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
......
...@@ -189,7 +189,11 @@ class TFMarianAttention(tf.keras.layers.Layer): ...@@ -189,7 +189,11 @@ class TFMarianAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
...@@ -339,11 +343,11 @@ class TFMarianEncoderLayer(tf.keras.layers.Layer): ...@@ -339,11 +343,11 @@ class TFMarianEncoderLayer(tf.keras.layers.Layer):
def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False): def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training=False):
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(encoder_attention_heads,)* `(encoder_attention_heads,)`
""" """
residual = hidden_states residual = hidden_states
hidden_states, self_attn_weights, _ = self.self_attn( hidden_states, self_attn_weights, _ = self.self_attn(
...@@ -416,17 +420,17 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer): ...@@ -416,17 +420,17 @@ class TFMarianDecoderLayer(tf.keras.layers.Layer):
) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]:
""" """
Args: Args:
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* hidden_states (`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
encoder_hidden_states (`tf.Tensor`): encoder_hidden_states (`tf.Tensor`):
cross attention input to the layer of shape *(seq_len, batch, embed_dim)* cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_attention_mask (`tf.Tensor`): encoder attention mask of size encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
*(decoder_attention_heads,)* `(decoder_attention_heads,)`
past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
""" """
residual = hidden_states residual = hidden_states
......
...@@ -149,7 +149,11 @@ class TFMBartAttention(tf.keras.layers.Layer): ...@@ -149,7 +149,11 @@ class TFMBartAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
......
...@@ -190,7 +190,11 @@ class TFPegasusAttention(tf.keras.layers.Layer): ...@@ -190,7 +190,11 @@ class TFPegasusAttention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
......
...@@ -17,7 +17,13 @@ ...@@ -17,7 +17,13 @@
# limitations under the License. # limitations under the License.
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from ...file_utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_torch_available from ...file_utils import (
_LazyModule,
is_sentencepiece_available,
is_speech_available,
is_tf_available,
is_torch_available,
)
_import_structure = { _import_structure = {
...@@ -36,6 +42,14 @@ if is_speech_available(): ...@@ -36,6 +42,14 @@ if is_speech_available():
if is_sentencepiece_available(): if is_sentencepiece_available():
_import_structure["processing_speech_to_text"] = ["Speech2TextProcessor"] _import_structure["processing_speech_to_text"] = ["Speech2TextProcessor"]
if is_tf_available():
_import_structure["modeling_tf_speech_to_text"] = [
"TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSpeech2TextForConditionalGeneration",
"TFSpeech2TextModel",
"TFSpeech2TextPreTrainedModel",
]
if is_torch_available(): if is_torch_available():
_import_structure["modeling_speech_to_text"] = [ _import_structure["modeling_speech_to_text"] = [
"SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
...@@ -57,6 +71,14 @@ if TYPE_CHECKING: ...@@ -57,6 +71,14 @@ if TYPE_CHECKING:
if is_sentencepiece_available(): if is_sentencepiece_available():
from .processing_speech_to_text import Speech2TextProcessor from .processing_speech_to_text import Speech2TextProcessor
if is_tf_available():
from .modeling_tf_speech_to_text import (
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSpeech2TextForConditionalGeneration,
TFSpeech2TextModel,
TFSpeech2TextPreTrainedModel,
)
if is_torch_available(): if is_torch_available():
from .modeling_speech_to_text import ( from .modeling_speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
......
...@@ -765,7 +765,11 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): ...@@ -765,7 +765,11 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer):
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = tf.keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim ** -0.5 self.scaling = self.head_dim ** -0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
......
...@@ -198,6 +198,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None ...@@ -198,6 +198,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = None
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
...@@ -276,6 +279,13 @@ class TFAutoModelForSequenceClassification(metaclass=DummyObject): ...@@ -276,6 +279,13 @@ class TFAutoModelForSequenceClassification(metaclass=DummyObject):
requires_backends(self, ["tf"]) requires_backends(self, ["tf"])
class TFAutoModelForSpeechSeq2Seq(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFAutoModelForTableQuestionAnswering(metaclass=DummyObject): class TFAutoModelForTableQuestionAnswering(metaclass=DummyObject):
_backends = ["tf"] _backends = ["tf"]
...@@ -1678,6 +1688,30 @@ class TFRoFormerPreTrainedModel(metaclass=DummyObject): ...@@ -1678,6 +1688,30 @@ class TFRoFormerPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["tf"]) requires_backends(self, ["tf"])
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TFSpeech2TextForConditionalGeneration(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFSpeech2TextModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFSpeech2TextPreTrainedModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = None TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment