Unverified Commit 29f1aee3 authored by Yoach Lacombe, committed by GitHub

Add SeamlessM4T v2 (#27779)



* add working conversion script

* first non-working version of modeling code

* update modeling code (working)

* make style

* make fix-copies

* add config docstrings

* add config to ignore docstrings formatting due to unconventional markdown

* fix copies

* fix generation num_return_sequences

* enrich docs

* add and fix tests beside integration tests

* update integration tests

* update repo id

* add tie weights and make style

* correct naming in .md

* fix imports and so on

* correct docstrings

* fix fp16 speech forward

* fix speechencoder attention

* make style

* fix copied from

* rename SeamlessM4T-v2 to SeamlessM4Tv2

* Apply suggestions on configuration
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* remove useless public models

* fix private models + better naming for T2U models

* clean speech encoder relative position embeddings

* refactor chunk attention

* add docstrings to chunk attention method

* improve naming and docstrings

* rename some attention variables + add temperature sampling in T2U model

* rename DOCSTRINGS variable names

* make style + remove 2 useless config parameters

* enrich model card

* remove any attention_head reference + fix temperature in T2U

* new fmt and make style

* Apply suggestions from code review
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* rename spkr_id->speaker_id and change docstrings of get_char_input_ids

* simplify v2attention

* make style

* Update seamless_m4t_v2.md

* update code and tests with last update

* update repo ids

* fill article name, abstract and authors

* update not_doctested and slow_doc tests

---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent 510270af
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF."""
import argparse
import os
from pathlib import Path
import torch
from accelerate.utils.modeling import find_tied_parameters
from seamless_communication.inference import Translator
from transformers import (
SeamlessM4TFeatureExtractor,
SeamlessM4TProcessor,
SeamlessM4TTokenizer,
SeamlessM4Tv2Config,
SeamlessM4Tv2Model,
)
from transformers.utils import logging
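# NOTE: `seamless_communication` is Meta's original inference package
# (https://github.com/facebookresearch/seamless_communication); it is not a
# transformers dependency and must be installed separately, since it provides
# the reference model that this script converts from.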
# fmt: off
UNIT_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__", ]
# fmt: on
# fmt: off
VOCODER_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",]
# fmt: on
# fmt: off
LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
# fmt: on
def assert_param_count(model_1, model_2):
    count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
    count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"


def param_count(model):
    return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])


def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return torch.device(device)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
vocoder_convert_list = [
    ("ups", "hifi_gan.upsampler"),
    ("conv_pre", "hifi_gan.conv_pre"),
    ("resblocks", "hifi_gan.resblocks"),
    ("conv_post", "hifi_gan.conv_post"),
    ("lang", "language_embedding"),
    ("spkr", "speaker_embedding"),
    ("dict.", "unit_embedding."),
    ("dur_predictor.conv1.0", "dur_predictor.conv1"),
    ("dur_predictor.conv2.0", "dur_predictor.conv2"),
]
# order is important
wav2vec_convert_list = [
    ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
    ("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
    ("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
    ("speech_encoder.inner.layers", "encoder.layers"),
    ("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
    ("speech_encoder.adaptor_layers", "adapter.layers"),
    ("inner_proj", "intermediate_dense"),
    ("self_attn.output_proj", "self_attn.linear_out"),
    ("output_proj", "output_dense"),
    ("self_attn.k_proj", "self_attn.linear_k"),
    ("self_attn.v_proj", "self_attn.linear_v"),
    ("self_attn.q_proj", "self_attn.linear_q"),
    ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
    ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
    ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"),
    ("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
    ("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
    ("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
    ("conv.depthwise_conv", "conv_module.depthwise_conv"),
    ("conv.batch_norm", "conv_module.batch_norm"),
    ("conv.layer_norm", "conv_module.depthwise_layer_norm"),
    ("conv_layer_norm", "conv_module.layer_norm"),
    ("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
    ("speech_encoder.proj2", "intermediate_ffn.output_dense"),
    ("speech_encoder.layer_norm", "inner_layer_norm"),
]
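# Worked example of the ordered replacement in `_convert_model` (after the
# "model." prefix is stripped): "speech_encoder.inner.layers.0.self_attn.output_proj.weight"
# first matches ("speech_encoder.inner.layers", "encoder.layers") and then
# ("self_attn.output_proj", "self_attn.linear_out"), yielding
# "encoder.layers.0.self_attn.linear_out.weight". The specific
# "self_attn.output_proj" rule must precede the generic ("output_proj", "output_dense")
# rule, otherwise the attention output projection would be wrongly renamed to
# "output_dense" — which is why the order of this list matters.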
t2u_convert_list = [
    ("t2u_model.final_proj", "lm_head"),
    ("t2u_model.", "model."),
    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
    ("encoder_decoder_attn", "cross_attention"),
    ("linear_k", "k_proj"),
    ("linear_v", "v_proj"),
    ("linear_q", "q_proj"),
    ("ffn.inner_proj", "ffn.fc1"),
    ("ffn.output_proj", "ffn.fc2"),
    ("output_proj", "out_proj"),
    ("decoder_frontend.embed_char", "decoder.embed_char"),
    ("decoder_frontend.pos_emb_alpha_char", "decoder.pos_emb_alpha_char"),
    ("decoder_frontend.embed", "decoder.embed_tokens"),
    ("decoder_frontend.pos_emb_alpha", "decoder.pos_emb_alpha"),
    ("conv1d.conv", "conv"),
    ("conv1d_layer_norm", "conv_layer_norm"),
    ("decoder_frontend.variance_adaptor", "decoder"),
    ("duration_predictor.conv1.0", "duration_predictor.conv1"),
    ("duration_predictor.conv2.0", "duration_predictor.conv2"),
]
text_convert_list = [
    ("text_encoder.", ""),
    ("text_decoder.", ""),
    ("text_encoder_frontend.embed", "embed_tokens"),
    ("text_decoder_frontend.embed", "embed_tokens"),
    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
    ("encoder_decoder_attn", "cross_attention"),
    ("linear_k", "k_proj"),
    ("linear_v", "v_proj"),
    ("linear_q", "q_proj"),
    ("ffn.inner_proj", "ffn.fc1"),
    ("ffn.output_proj", "ffn.fc2"),
    ("output_proj", "out_proj"),
    ("final_proj", "lm_head"),
]
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
def _load_hf_config():
    return SeamlessM4Tv2Config()


def _convert_model(
    original_model,
    hf_model,
    convert_list,
    device,
    unwanted_prefix="model.",
    filter_state_dict="speech",
    exclude_state_dict=None,
):
    state_dict = original_model.state_dict()

    # filter func
    if isinstance(filter_state_dict, str):

        def filter_func(x):
            return filter_state_dict in x[0]

    else:

        def filter_func(item):
            if exclude_state_dict is not None and exclude_state_dict in item[0]:
                return False
            for filter_el in filter_state_dict:
                if filter_el in item[0]:
                    return True
            return False

    state_dict = dict(filter(filter_func, state_dict.items()))

    for k, v in list(state_dict.items()):
        new_k = k[len(unwanted_prefix) :]
        for old_layer_name, new_layer_name in convert_list:
            if old_layer_name in new_k:
                new_k = new_k.replace(old_layer_name, new_layer_name)

        # must do it by hand
        if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
            new_k = new_k.replace("layer_norm", "final_layer_norm")

        state_dict[new_k] = state_dict.pop(k)

    extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
    missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
    missing_keys = {k for k in missing_keys if "final_logits_bias" not in k}
    if len(extra_keys) != 0:
        raise ValueError(f"extra keys found: {extra_keys}")
    if len(missing_keys) != 0:
        raise ValueError(f"missing keys: {missing_keys}")
    hf_model.load_state_dict(state_dict, strict=False)
    n_params = param_count(hf_model)

    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")

    hf_model.eval()
    hf_model.to(device)
    del state_dict

    return hf_model
def load_model(save_dir, model_type, repo_id):
    """
    Meta SeamlessM4Tv2 is made of 8 main components:
    - speech_encoder (#1) and speech_encoder_frontend (#2)
    - t2u_model (#3)
    - text_encoder (#4) and text_encoder_frontend (#5)
    - text_decoder (#6) [and text_decoder_frontend (#5), equal to text_encoder_frontend]
    - final_proj (#7)
    - vocoder (#8)
    """
    device = _grab_best_device()
    name = "seamlessM4T_v2_large"

    original_model = Translator(name, "vocoder_v2", device, dtype=torch.float32)

    ######### TOKENIZER

    langs = LARGE_SUPPORTED_LANGUAGES
    langs = [f"__{lang}__" for lang in langs]
    vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")

    save_dir = os.path.join(save_dir, name)
    Path(save_dir).mkdir(exist_ok=True)

    tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)

    sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")

    tokenizer.save_pretrained(save_dir)
    tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)

    if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
        raise ValueError(
            f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
        )

    ####### get language to ids dict
    text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
    # offset: vocoder unit vocab size + 5 (for EOS/PAD/BOS/UNK/MSK) + len(supported_languages)
    t2u_lang_code_to_id = {
        code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
        for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
    }
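    # e.g. with the 38 unit languages above, "arb" (index 0) maps to
    # 0 + 10005 + 38 = 10043, "ben" (index 1) to 10044, and so on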
    vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}

    ######### FE

    fe = SeamlessM4TFeatureExtractor(language_code=langs)

    fe.save_pretrained(save_dir)
    fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)

    processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
    processor.save_pretrained(save_dir)
    processor.push_to_hub(repo_id=repo_id, create_pr=True)

    processor = SeamlessM4TProcessor.from_pretrained(save_dir)

    ######## Model

    # init config
    hf_config = _load_hf_config()

    ######## get id_to_text and char_to_id from original model tokenizers
    id_to_text = {i: original_model.text_tokenizer.model.index_to_token(i) for i in range(hf_config.vocab_size)}
    char_to_id = {
        original_model.model.t2u_model.decoder_frontend.char_tokenizer.model.index_to_token(i): i for i in range(10904)
    }

    # init model
    hf_model = SeamlessM4Tv2Model(hf_config)

    hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
    hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
    hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
    hf_model.generation_config.__setattr__("id_to_text", id_to_text)
    hf_model.generation_config.__setattr__("char_to_id", char_to_id)
    # -1. take care of vocoder
    # similarly to speech T5 must apply and remove weight norm
    hf_model.vocoder.apply_weight_norm()
    hf_model.vocoder = _convert_model(
        original_model,
        hf_model.vocoder,
        vocoder_convert_list,
        device,
        unwanted_prefix="vocoder.code_generator.",
        filter_state_dict="vocoder",
    )
    hf_model.vocoder.remove_weight_norm()
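    # (The original vocoder checkpoint stores weight-normalized convolutions as
    # separate weight_g / weight_v tensors, so we first apply weight norm to make
    # the HF module expose matching parameter names, load the weights, then remove
    # it to fuse everything back into plain weight tensors.)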
    # 1. take care of speech encoder
    wav2vec = hf_model.speech_encoder
    hf_model.speech_encoder = _convert_model(
        original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
    )

    # 2. take care of t2u
    hf_model.t2u_model = _convert_model(
        original_model,
        hf_model.t2u_model,
        t2u_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict="t2u_model",
    )

    # 3. take care of text encoder
    hf_model.text_encoder = _convert_model(
        original_model,
        hf_model.text_encoder,
        text_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.text_encoder"],
        exclude_state_dict="t2u_model",
    )

    # 4. take care of text decoder
    hf_model.text_decoder = _convert_model(
        original_model,
        hf_model.text_decoder,
        text_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.text_decoder"],
        exclude_state_dict="t2u_model",
    )

    # 5. take care of final proj
    hf_model.lm_head = _convert_model(
        original_model,
        hf_model.lm_head,
        [("final_proj.", "")],
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.final_proj"],
        exclude_state_dict="t2u_model",
    )

    # sanity check
    print(find_tied_parameters(hf_model))

    count_1 = param_count(hf_model)
    count_2 = param_count(original_model)

    print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
    print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")

    del original_model

    hf_model.generation_config._from_model_config = False
    hf_model.save_pretrained(save_dir)
    hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
    hf_model = SeamlessM4Tv2Model.from_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_type",
default="large",
type=str,
help="Model type.",
)
parser.add_argument(
"--save_dir",
default="/home/ubuntu/weights_v2",
type=str,
help="Path to the output PyTorch model.",
)
parser.add_argument(
"--repo_id",
default="facebook/seamless-m4t-v2-large",
type=str,
help="Repo ID.",
)
args = parser.parse_args()
load_model(args.save_dir, args.model_type, args.repo_id)
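# Example invocation (paths are illustrative; the SentencePiece model is
# expected at ~/tokenizer/<model_type>/tokenizer.model, see `load_model` above):
#   python convert_fairseq2_to_hf.py --model_type large --save_dir /home/ubuntu/weights_v2 --repo_id facebook/seamless-m4t-v2-large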
@@ -7213,6 +7213,51 @@ class SeamlessM4TTextToUnitModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])

+SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class SeamlessM4Tv2ForSpeechToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForSpeechToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
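For context: `DummyObject` placeholders like the ones added above are what transformers exposes when torch is not installed; touching one triggers `requires_backends`, which raises an ImportError naming the missing backend. A minimal sketch of the pattern (simplified from `transformers.utils`; `_is_backend_available` is a hypothetical stand-in for the real availability checks):

# simplified sketch of the dummy-object mechanism, not the exact library code
class DummyObject(type):
    # Metaclass: any public attribute access on the class triggers the backend check.
    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)


def requires_backends(obj, backends):
    # Raise a helpful ImportError listing the backends that are missing.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    missing = [b for b in backends if not _is_backend_available(b)]  # hypothetical helper
    if missing:
        raise ImportError(f"{name} requires the following backends: {', '.join(missing)}")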
...
@@ -16,7 +16,6 @@
 import copy
-import inspect
 import tempfile
 import unittest
@@ -479,10 +478,6 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass

-    @unittest.skip(reason="The speech encoder doesn't support head masking")
-    def test_generate_with_head_masking(self):
-        pass
-
     @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features")
     def test_forward_signature(self):
         pass
@@ -714,43 +709,6 @@ class SeamlessM4TModelWithTextInputTest(
     def test_model_weights_reload_no_missing_tied_weights(self):
         pass

-    def test_generate_with_head_masking(self):
-        """Test designed for encoder-decoder models to ensure the attention head masking is used."""
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
-            model = model_class(config).to(torch_device).eval()
-
-            head_masking = {
-                "head_mask": torch.zeros(config.encoder_layers, config.encoder_attention_heads, device=torch_device),
-                "decoder_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-                "cross_attn_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-            }
-
-            signature = inspect.signature(model.forward)
-            # We want to test only models where encoder/decoder head masking is implemented
-            if not set(head_masking.keys()) < {*signature.parameters.keys()}:
-                continue
-
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    num_beams=1,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    remove_invalid_values=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
-
     @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features")
     def test_forward_signature(self):
         pass
...
This diff is collapsed.
@@ -96,6 +96,21 @@ SPECIAL_CASES_TO_ALLOW = {
         "t2u_encoder_layers",
         "t2u_max_position_embeddings",
     ],
+    # Actually used in the config or generation config, in that case necessary for the sub-components generation
+    "SeamlessM4Tv2Config": [
+        "max_new_tokens",
+        "t2u_decoder_attention_heads",
+        "t2u_decoder_ffn_dim",
+        "t2u_decoder_layers",
+        "t2u_encoder_attention_heads",
+        "t2u_encoder_ffn_dim",
+        "t2u_encoder_layers",
+        "t2u_max_position_embeddings",
+        "t2u_variance_pred_dropout",
+        "t2u_variance_predictor_embed_dim",
+        "t2u_variance_predictor_hidden_dim",
+        "t2u_variance_predictor_kernel_size",
+    ],
 }
...
@@ -463,6 +463,7 @@ OBJECTS_TO_IGNORE = [
     "SamConfig",
     "SamPromptEncoderConfig",
     "SeamlessM4TConfig",  # use of unconventional markdown
+    "SeamlessM4Tv2Config",  # use of unconventional markdown
     "Seq2SeqTrainingArguments",
     "SpecialTokensMixin",
     "Speech2Text2Config",
...
@@ -76,6 +76,9 @@ PRIVATE_MODELS = [
     "Kosmos2TextModel",
     "Kosmos2TextForCausalLM",
     "Kosmos2VisionModel",
+    "SeamlessM4Tv2TextToUnitModel",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2TextToUnitForConditionalGeneration",
 ]

 # Update this list for models that are not tested with a comment explaining the reason it should not be.
@@ -296,6 +299,10 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "SeamlessM4TCodeHifiGan",
     "SeamlessM4TForSpeechToSpeech",  # no auto class for speech-to-speech
     "TvpForVideoGrounding",
+    "SeamlessM4Tv2NARTextToUnitModel",
+    "SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2ForSpeechToSpeech",  # no auto class for speech-to-speech
 ]

 # DO NOT edit this list!
...
@@ -776,6 +776,7 @@ src/transformers/models/sam/modeling_sam.py
 src/transformers/models/sam/modeling_tf_sam.py
 src/transformers/models/sam/processing_sam.py
 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py
+src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py
 src/transformers/models/segformer/configuration_segformer.py
 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py
 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py
...
@@ -2,6 +2,7 @@ docs/source/en/generation_strategies.md
 docs/source/en/model_doc/ctrl.md
 docs/source/en/model_doc/kosmos-2.md
 docs/source/en/model_doc/seamless_m4t.md
+docs/source/en/model_doc/seamless_m4t_v2.md
 docs/source/en/task_summary.md
 docs/source/en/tasks/prompting.md
 src/transformers/models/blip_2/modeling_blip_2.py
...