Unverified Commit 29f1aee3 authored by Yoach Lacombe, committed by GitHub

Add SeamlessM4T v2 (#27779)



* add working conversion script

* first non-working version of modeling code

* update modeling code (working)

* make style

* make fix-copies

* add config docstrings

* add config to ignore docstrings formatting due to unconventional markdown

* fix copies

* fix generation num_return_sequences

* enrich docs

* add and fix tests beside integration tests

* update integration tests

* update repo id

* add tie weights and make style

* correct naming in .md

* fix imports and so on

* correct docstrings

* fix fp16 speech forward

* fix speechencoder attention

* make style

* fix copied from

* rename SeamlessM4T-v2 to SeamlessM4Tv2

* Apply suggestions on configuration
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* remove useless public models

* fix private models + better naming for T2U models

* clean speech encoder relative position embeddings

* refactor chunk attention

* add docstrings to chunk attention method

* improve naming and docstrings

* rename some attention variables + add temperature sampling in T2U model

* rename DOCSTRINGS variable names

* make style + remove 2 useless config parameters

* enrich model card

* remove any attention_head reference + fix temperature in T2U

* new fmt and make style

* Apply suggestions from code review
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* rename spkr_id->speaker_id and change docstrings of get_char_input_ids

* simplify v2attention

* make style

* Update seamless_m4t_v2.md

* update code and tests with last update

* update repo ids

* fill article name, abstract and authors

* update not_doctested and slow_doc tests

---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent 510270af
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF."""
import argparse
import os
from pathlib import Path
import torch
from accelerate.utils.modeling import find_tied_parameters
from seamless_communication.inference import Translator
from transformers import (
SeamlessM4TFeatureExtractor,
SeamlessM4TProcessor,
SeamlessM4TTokenizer,
SeamlessM4Tv2Config,
SeamlessM4Tv2Model,
)
from transformers.utils import logging
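# NOTE: `seamless_communication` is Meta's original inference package
# (https://github.com/facebookresearch/seamless_communication); it is not a
# transformers dependency and must be installed separately, since it provides
# the reference model that this script converts from.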
# fmt: off
UNIT_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__", ]
# fmt: on
# fmt: off
VOCODER_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",]
# fmt: on
# fmt: off
LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
# fmt: on
def assert_param_count(model_1, model_2):
    count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
    count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"


def param_count(model):
    return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])


def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return torch.device(device)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
vocoder_convert_list = [
    ("ups", "hifi_gan.upsampler"),
    ("conv_pre", "hifi_gan.conv_pre"),
    ("resblocks", "hifi_gan.resblocks"),
    ("conv_post", "hifi_gan.conv_post"),
    ("lang", "language_embedding"),
    ("spkr", "speaker_embedding"),
    ("dict.", "unit_embedding."),
    ("dur_predictor.conv1.0", "dur_predictor.conv1"),
    ("dur_predictor.conv2.0", "dur_predictor.conv2"),
]
# order is important
wav2vec_convert_list = [
    ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
    ("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
    ("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
    ("speech_encoder.inner.layers", "encoder.layers"),
    ("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
    ("speech_encoder.adaptor_layers", "adapter.layers"),
    ("inner_proj", "intermediate_dense"),
    ("self_attn.output_proj", "self_attn.linear_out"),
    ("output_proj", "output_dense"),
    ("self_attn.k_proj", "self_attn.linear_k"),
    ("self_attn.v_proj", "self_attn.linear_v"),
    ("self_attn.q_proj", "self_attn.linear_q"),
    ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
    ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
    ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"),
    ("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
    ("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
    ("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
    ("conv.depthwise_conv", "conv_module.depthwise_conv"),
    ("conv.batch_norm", "conv_module.batch_norm"),
    ("conv.layer_norm", "conv_module.depthwise_layer_norm"),
    ("conv_layer_norm", "conv_module.layer_norm"),
    ("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
    ("speech_encoder.proj2", "intermediate_ffn.output_dense"),
    ("speech_encoder.layer_norm", "inner_layer_norm"),
]
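# Worked example of the ordered replacement in `_convert_model` (after the
# "model." prefix is stripped): "speech_encoder.inner.layers.0.self_attn.output_proj.weight"
# first matches ("speech_encoder.inner.layers", "encoder.layers") and then
# ("self_attn.output_proj", "self_attn.linear_out"), yielding
# "encoder.layers.0.self_attn.linear_out.weight". The specific
# "self_attn.output_proj" rule must precede the generic ("output_proj", "output_dense")
# rule, otherwise the attention output projection would be wrongly renamed to
# "output_dense" — which is why the order of this list matters.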
t2u_convert_list = [
    ("t2u_model.final_proj", "lm_head"),
    ("t2u_model.", "model."),
    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
    ("encoder_decoder_attn", "cross_attention"),
    ("linear_k", "k_proj"),
    ("linear_v", "v_proj"),
    ("linear_q", "q_proj"),
    ("ffn.inner_proj", "ffn.fc1"),
    ("ffn.output_proj", "ffn.fc2"),
    ("output_proj", "out_proj"),
    ("decoder_frontend.embed_char", "decoder.embed_char"),
    ("decoder_frontend.pos_emb_alpha_char", "decoder.pos_emb_alpha_char"),
    ("decoder_frontend.embed", "decoder.embed_tokens"),
    ("decoder_frontend.pos_emb_alpha", "decoder.pos_emb_alpha"),
    ("conv1d.conv", "conv"),
    ("conv1d_layer_norm", "conv_layer_norm"),
    ("decoder_frontend.variance_adaptor", "decoder"),
    ("duration_predictor.conv1.0", "duration_predictor.conv1"),
    ("duration_predictor.conv2.0", "duration_predictor.conv2"),
]
text_convert_list = [
    ("text_encoder.", ""),
    ("text_decoder.", ""),
    ("text_encoder_frontend.embed", "embed_tokens"),
    ("text_decoder_frontend.embed", "embed_tokens"),
    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
    ("encoder_decoder_attn", "cross_attention"),
    ("linear_k", "k_proj"),
    ("linear_v", "v_proj"),
    ("linear_q", "q_proj"),
    ("ffn.inner_proj", "ffn.fc1"),
    ("ffn.output_proj", "ffn.fc2"),
    ("output_proj", "out_proj"),
    ("final_proj", "lm_head"),
]
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
def _load_hf_config():
    return SeamlessM4Tv2Config()


def _convert_model(
    original_model,
    hf_model,
    convert_list,
    device,
    unwanted_prefix="model.",
    filter_state_dict="speech",
    exclude_state_dict=None,
):
    state_dict = original_model.state_dict()

    # filter func
    if isinstance(filter_state_dict, str):

        def filter_func(x):
            return filter_state_dict in x[0]

    else:

        def filter_func(item):
            if exclude_state_dict is not None and exclude_state_dict in item[0]:
                return False
            for filter_el in filter_state_dict:
                if filter_el in item[0]:
                    return True
            return False

    state_dict = dict(filter(filter_func, state_dict.items()))

    for k, v in list(state_dict.items()):
        new_k = k[len(unwanted_prefix) :]
        for old_layer_name, new_layer_name in convert_list:
            if old_layer_name in new_k:
                new_k = new_k.replace(old_layer_name, new_layer_name)

        # must do it by hand
        if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
            new_k = new_k.replace("layer_norm", "final_layer_norm")

        state_dict[new_k] = state_dict.pop(k)

    extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
    missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
    missing_keys = {k for k in missing_keys if "final_logits_bias" not in k}
    if len(extra_keys) != 0:
        raise ValueError(f"extra keys found: {extra_keys}")
    if len(missing_keys) != 0:
        raise ValueError(f"missing keys: {missing_keys}")
    hf_model.load_state_dict(state_dict, strict=False)
    n_params = param_count(hf_model)

    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")

    hf_model.eval()
    hf_model.to(device)
    del state_dict

    return hf_model
def load_model(save_dir, model_type, repo_id):
    """
    Meta SeamlessM4Tv2 is made of 8 main components:
    - speech_encoder (#1) and speech_encoder_frontend (#2)
    - t2u_model (#3)
    - text_encoder (#4) and text_encoder_frontend (#5)
    - text_decoder (#6) [and text_decoder_frontend (#5), equal to text_encoder_frontend]
    - final_proj (#7)
    - vocoder (#8)
    """
    device = _grab_best_device()
    name = "seamlessM4T_v2_large"

    original_model = Translator(name, "vocoder_v2", device, dtype=torch.float32)

    ######### TOKENIZER

    langs = LARGE_SUPPORTED_LANGUAGES
    langs = [f"__{lang}__" for lang in langs]
    vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")

    save_dir = os.path.join(save_dir, name)
    Path(save_dir).mkdir(exist_ok=True)

    tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)

    sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")

    tokenizer.save_pretrained(save_dir)
    tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)

    if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
        raise ValueError(
            f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
        )

    ####### get language to ids dict
    text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
    # offset: vocoder unit vocab size + 5 (for EOS/PAD/BOS/UNK/MSK) + len(supported_languages)
    t2u_lang_code_to_id = {
        code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
        for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
    }
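    # e.g. with the 38 unit languages above, "arb" (index 0) maps to
    # 0 + 10005 + 38 = 10043, "ben" (index 1) to 10044, and so on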
    vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}

    ######### FE

    fe = SeamlessM4TFeatureExtractor(language_code=langs)

    fe.save_pretrained(save_dir)
    fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)

    processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
    processor.save_pretrained(save_dir)
    processor.push_to_hub(repo_id=repo_id, create_pr=True)

    processor = SeamlessM4TProcessor.from_pretrained(save_dir)

    ######## Model

    # init config
    hf_config = _load_hf_config()

    ######## get id_to_text and char_to_id from original model tokenizers
    id_to_text = {i: original_model.text_tokenizer.model.index_to_token(i) for i in range(hf_config.vocab_size)}
    char_to_id = {
        original_model.model.t2u_model.decoder_frontend.char_tokenizer.model.index_to_token(i): i for i in range(10904)
    }

    # init model
    hf_model = SeamlessM4Tv2Model(hf_config)

    hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
    hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
    hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
    hf_model.generation_config.__setattr__("id_to_text", id_to_text)
    hf_model.generation_config.__setattr__("char_to_id", char_to_id)
    # -1. take care of vocoder
    # similarly to speech T5 must apply and remove weight norm
    hf_model.vocoder.apply_weight_norm()
    hf_model.vocoder = _convert_model(
        original_model,
        hf_model.vocoder,
        vocoder_convert_list,
        device,
        unwanted_prefix="vocoder.code_generator.",
        filter_state_dict="vocoder",
    )
    hf_model.vocoder.remove_weight_norm()
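    # (The original vocoder checkpoint stores weight-normalized convolutions as
    # separate weight_g / weight_v tensors, so we first apply weight norm to make
    # the HF module expose matching parameter names, load the weights, then remove
    # it to fuse everything back into plain weight tensors.)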
    # 1. take care of speech encoder
    wav2vec = hf_model.speech_encoder
    hf_model.speech_encoder = _convert_model(
        original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
    )

    # 2. take care of t2u
    hf_model.t2u_model = _convert_model(
        original_model,
        hf_model.t2u_model,
        t2u_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict="t2u_model",
    )

    # 3. take care of text encoder
    hf_model.text_encoder = _convert_model(
        original_model,
        hf_model.text_encoder,
        text_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.text_encoder"],
        exclude_state_dict="t2u_model",
    )

    # 4. take care of text decoder
    hf_model.text_decoder = _convert_model(
        original_model,
        hf_model.text_decoder,
        text_convert_list,
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.text_decoder"],
        exclude_state_dict="t2u_model",
    )

    # 5. take care of final proj
    hf_model.lm_head = _convert_model(
        original_model,
        hf_model.lm_head,
        [("final_proj.", "")],
        device,
        unwanted_prefix="model.",
        filter_state_dict=["model.final_proj"],
        exclude_state_dict="t2u_model",
    )

    # sanity check
    print(find_tied_parameters(hf_model))

    count_1 = param_count(hf_model)
    count_2 = param_count(original_model)

    print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
    print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")

    del original_model

    hf_model.generation_config._from_model_config = False
    hf_model.save_pretrained(save_dir)
    hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
    hf_model = SeamlessM4Tv2Model.from_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_type",
default="large",
type=str,
help="Model type.",
)
parser.add_argument(
"--save_dir",
default="/home/ubuntu/weights_v2",
type=str,
help="Path to the output PyTorch model.",
)
parser.add_argument(
"--repo_id",
default="facebook/seamless-m4t-v2-large",
type=str,
help="Repo ID.",
)
args = parser.parse_args()
load_model(args.save_dir, args.model_type, args.repo_id)
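# Example invocation (paths are illustrative; the SentencePiece model is
# expected at ~/tokenizer/<model_type>/tokenizer.model, see `load_model` above):
#   python convert_fairseq2_to_hf.py --model_type large --save_dir /home/ubuntu/weights_v2 --repo_id facebook/seamless-m4t-v2-large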
@@ -7213,6 +7213,51 @@ class SeamlessM4TTextToUnitModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])

+SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class SeamlessM4Tv2ForSpeechToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForSpeechToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
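For context: `DummyObject` placeholders like the ones added above are what transformers exposes when torch is not installed; touching one triggers `requires_backends`, which raises an ImportError naming the missing backend. A minimal sketch of the pattern (simplified from `transformers.utils`; `_is_backend_available` is a hypothetical stand-in for the real availability checks):

# simplified sketch of the dummy-object mechanism, not the exact library code
class DummyObject(type):
    # Metaclass: any public attribute access on the class triggers the backend check.
    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)


def requires_backends(obj, backends):
    # Raise a helpful ImportError listing the backends that are missing.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    missing = [b for b in backends if not _is_backend_available(b)]  # hypothetical helper
    if missing:
        raise ImportError(f"{name} requires the following backends: {', '.join(missing)}")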
...
@@ -16,7 +16,6 @@
 import copy
-import inspect
 import tempfile
 import unittest
@@ -479,10 +478,6 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
     def test_save_load_fast_init_to_base(self):
         pass

-    @unittest.skip(reason="The speech encoder doesn't support head masking")
-    def test_generate_with_head_masking(self):
-        pass
-
     @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features")
     def test_forward_signature(self):
         pass
@@ -714,43 +709,6 @@ class SeamlessM4TModelWithTextInputTest(
     def test_model_weights_reload_no_missing_tied_weights(self):
         pass

-    def test_generate_with_head_masking(self):
-        """Test designed for encoder-decoder models to ensure the attention head masking is used."""
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
-            model = model_class(config).to(torch_device).eval()
-
-            head_masking = {
-                "head_mask": torch.zeros(config.encoder_layers, config.encoder_attention_heads, device=torch_device),
-                "decoder_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-                "cross_attn_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-            }
-
-            signature = inspect.signature(model.forward)
-            # We want to test only models where encoder/decoder head masking is implemented
-            if not set(head_masking.keys()) < {*signature.parameters.keys()}:
-                continue
-
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    num_beams=1,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    remove_invalid_values=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
-
     @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features")
     def test_forward_signature(self):
         pass
...
This diff is collapsed.
@@ -96,6 +96,21 @@ SPECIAL_CASES_TO_ALLOW = {
         "t2u_encoder_layers",
         "t2u_max_position_embeddings",
     ],
+    # Actually used in the config or generation config, in that case necessary for the sub-components generation
+    "SeamlessM4Tv2Config": [
+        "max_new_tokens",
+        "t2u_decoder_attention_heads",
+        "t2u_decoder_ffn_dim",
+        "t2u_decoder_layers",
+        "t2u_encoder_attention_heads",
+        "t2u_encoder_ffn_dim",
+        "t2u_encoder_layers",
+        "t2u_max_position_embeddings",
+        "t2u_variance_pred_dropout",
+        "t2u_variance_predictor_embed_dim",
+        "t2u_variance_predictor_hidden_dim",
+        "t2u_variance_predictor_kernel_size",
+    ],
 }
...
@@ -463,6 +463,7 @@ OBJECTS_TO_IGNORE = [
     "SamConfig",
     "SamPromptEncoderConfig",
     "SeamlessM4TConfig",  # use of unconventional markdown
+    "SeamlessM4Tv2Config",  # use of unconventional markdown
     "Seq2SeqTrainingArguments",
     "SpecialTokensMixin",
     "Speech2Text2Config",
...
@@ -76,6 +76,9 @@ PRIVATE_MODELS = [
     "Kosmos2TextModel",
     "Kosmos2TextForCausalLM",
     "Kosmos2VisionModel",
+    "SeamlessM4Tv2TextToUnitModel",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2TextToUnitForConditionalGeneration",
 ]

 # Update this list for models that are not tested with a comment explaining the reason it should not be.
@@ -296,6 +299,10 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "SeamlessM4TCodeHifiGan",
     "SeamlessM4TForSpeechToSpeech",  # no auto class for speech-to-speech
     "TvpForVideoGrounding",
+    "SeamlessM4Tv2NARTextToUnitModel",
+    "SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2ForSpeechToSpeech",  # no auto class for speech-to-speech
 ]

 # DO NOT edit this list!
...
@@ -776,6 +776,7 @@ src/transformers/models/sam/modeling_sam.py
 src/transformers/models/sam/modeling_tf_sam.py
 src/transformers/models/sam/processing_sam.py
 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py
+src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py
 src/transformers/models/segformer/configuration_segformer.py
 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py
 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py
...
@@ -2,6 +2,7 @@ docs/source/en/generation_strategies.md
 docs/source/en/model_doc/ctrl.md
 docs/source/en/model_doc/kosmos-2.md
 docs/source/en/model_doc/seamless_m4t.md
+docs/source/en/model_doc/seamless_m4t_v2.md
 docs/source/en/task_summary.md
 docs/source/en/tasks/prompting.md
 src/transformers/models/blip_2/modeling_blip_2.py
...