Unverified Commit c43b380e authored by Yoach Lacombe, committed by GitHub

Add MusicGen Melody (#28819)



* first modeling code

* make repository

* still WIP

* update model

* add tests

* add latest change

* clean docstrings and copied from

* update docstrings md and readme

* correct chroma function

* correct copied from and remove unrelated test

* add doc to toctree

* correct imports

* add convert script to notdoctested

* Add suggestion from Sanchit
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* correct get_unconditional_inputs docstrings

* modify README according to Sanchit's feedback

* add chroma to audio utils

* clean librosa and torchaudio hard dependencies

* fix FE

* refactor audio decoder -> audio encoder for consistency with previous musicgen

* refactor conditional -> encoder

* modify sampling rate logic

* modify license at the beginning

* refactor all_self_attns->all_attentions

* remove ignore copy from causallm generate

* add copied from for from_sub_models

* fix make copies

* add warning if audio is truncated

* add copied from where relevant

* remove artefact

* fix convert script

* fix torchaudio and FE

* modify chroma method according to feedback-> better naming

* refactor input_values->input_features

* refactor input_values->input_features and fix import fe

* add input_features to docstrings

* correct inputs_embeds logic

* remove dtype conversion

* refactor _prepare_conditional_hidden_states_kwargs_for_generation ->_prepare_encoder_hidden_states_kwargs_for_generation

* change warning for chroma length

* Update src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* change way to save wav, using soundfile

* correct docs and change to soundfile

* fix import

* fix init proj layers

* remove line breaks from md

* fix issue with docstrings

* add FE suggestions

* improve `is in` logic and remove useless imports

* remove custom from_pretrained

* simplify docstring code

* add suggestions for modeling tests

* make style

* update conversion script with sanity check

* remove encoder attention mask from conditional generation

* replace musicgen melody checkpoints with official org

* rename ylacombe->facebook in checkpoints

* fix copies

* remove unnecessary warning

* add shape in code docstrings

* add files to slow doc tests

* fix md bug and add md to not_tested

* make fix-copies

* fix hidden states test and batching

---------
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
parent bf3dfd11
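As a quick orientation before the diff, here is a minimal usage sketch of the API this PR introduces. It is not part of the diff itself: the checkpoint name, processor/model classes and call signatures are taken from the docstrings below, and the synthetic sine-wave "melody" is just a stand-in for a real audio prompt.

```python
import math

import torch

from transformers import MusicgenMelodyForConditionalGeneration, MusicgenMelodyProcessor

processor = MusicgenMelodyProcessor.from_pretrained("facebook/musicgen-melody")
model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")

# 5 seconds of a synthetic 440 Hz tone standing in for a melody prompt, sampled at 32 kHz
sampling_rate = 32000
melody = torch.sin(2 * math.pi * 440 * torch.arange(5 * sampling_rate) / sampling_rate)

# the processor tokenizes the text prompt and extracts chroma features from the audio prompt
inputs = processor(
    audio=melody,
    text=["80s pop track with bassy drums and synth"],
    sampling_rate=sampling_rate,
    padding=True,
    return_tensors="pt",
)
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3.0, max_new_tokens=256)
```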
......@@ -161,6 +161,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("mra", "MraConfig"),
("mt5", "MT5Config"),
("musicgen", "MusicgenConfig"),
("musicgen_melody", "MusicgenMelodyConfig"),
("mvp", "MvpConfig"),
("nat", "NatConfig"),
("nezha", "NezhaConfig"),
......@@ -396,6 +397,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("mpt", "MPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mra", "MRA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("musicgen", "MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("musicgen_melody", "MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mvp", "MVP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("nat", "NAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("nezha", "NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
......@@ -649,6 +651,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("mra", "MRA"),
("mt5", "MT5"),
("musicgen", "MusicGen"),
("musicgen_melody", "MusicGen Melody"),
("mvp", "MVP"),
("nat", "NAT"),
("nezha", "Nezha"),
......
......@@ -454,6 +454,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mixtral", "MixtralForCausalLM"),
("mpt", "MptForCausalLM"),
("musicgen", "MusicgenForCausalLM"),
("musicgen_melody", "MusicgenMelodyForCausalLM"),
("mvp", "MvpForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),
......@@ -1176,6 +1177,7 @@ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = OrderedDict(
("bark", "BarkModel"),
("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"),
("musicgen", "MusicgenForConditionalGeneration"),
("musicgen_melody", "MusicgenMelodyForConditionalGeneration"),
("seamless_m4t", "SeamlessM4TForTextToSpeech"),
("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"),
("vits", "VitsModel"),
......
......@@ -280,6 +280,7 @@ else:
),
),
("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
(
......
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_torchaudio_available,
)
_import_structure = {
"configuration_musicgen_melody": [
"MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MusicgenMelodyConfig",
"MusicgenMelodyDecoderConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_musicgen_melody"] = [
"MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST",
"MusicgenMelodyForConditionalGeneration",
"MusicgenMelodyForCausalLM",
"MusicgenMelodyModel",
"MusicgenMelodyPreTrainedModel",
]
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_musicgen_melody"] = ["MusicgenMelodyFeatureExtractor"]
_import_structure["processing_musicgen_melody"] = ["MusicgenMelodyProcessor"]
if TYPE_CHECKING:
from .configuration_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP,
MusicgenMelodyConfig,
MusicgenMelodyDecoderConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST,
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
MusicgenMelodyModel,
MusicgenMelodyPreTrainedModel,
)
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from .processing_musicgen_melody import MusicgenMelodyProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2024 Meta AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Musicgen Melody model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/musicgen-melody": "https://huggingface.co/facebook/musicgen-melody/resolve/main/config.json",
}
class MusicgenMelodyDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenMelodyDecoder`]. It is used to instantiate a
Musicgen Melody decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 2048):
Vocabulary size of the MusicgenMelodyDecoder model. Defines the number of different tokens that can be
represented by the `input_ids` passed when calling [`MusicgenMelodyDecoder`].
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically, set this to something large
just in case (e.g., 512 or 1024 or 2048).
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of decoder layers.
ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer block.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer block.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the decoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, text_encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
initializer_factor (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by dividing by sqrt(hidden_size).
num_codebooks (`int`, *optional*, defaults to 4):
The number of parallel codebooks forwarded to the model.
audio_channels (`int`, *optional*, defaults to 1):
Number of audio channels used by the model (either mono or stereo). Stereo models generate a separate
audio stream for the left/right output channels. Mono models generate a single audio stream output.
pad_token_id (`int`, *optional*, defaults to 2048): The id of the *padding* token.
bos_token_id (`int`, *optional*, defaults to 2048): The id of the *beginning-of-sequence* token.
eos_token_id (`int`, *optional*): The id of the *end-of-sequence* token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie word embeddings with the text encoder.
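Example (an illustrative sketch using the defaults documented above; the decoder-only model takes this config directly, as in the conversion script):
```python
>>> from transformers import MusicgenMelodyDecoderConfig, MusicgenMelodyForCausalLM
>>> # Initializing a Musicgen Melody decoder configuration with the documented defaults
>>> configuration = MusicgenMelodyDecoderConfig()
>>> # Initializing a MusicgenMelodyForCausalLM (with random weights) from that configuration
>>> model = MusicgenMelodyForCausalLM(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```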
"""
model_type = "musicgen_melody_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=2048,
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
num_attention_heads=16,
layerdrop=0.0,
use_cache=True,
activation_function="gelu",
hidden_size=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
initializer_factor=0.02,
scale_embedding=False,
num_codebooks=4,
audio_channels=1,
pad_token_id=2048,
bos_token_id=2048,
eos_token_id=None,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.ffn_dim = ffn_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.initializer_factor = initializer_factor
self.layerdrop = layerdrop
self.use_cache = use_cache
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.num_codebooks = num_codebooks
if audio_channels not in [1, 2]:
raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
self.audio_channels = audio_channels
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
class MusicgenMelodyConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenMelodyModel`]. It is used to instantiate a
Musicgen Melody model according to the specified arguments, defining the text encoder, audio encoder and Musicgen Melody decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_chroma (`int`, *optional*, defaults to 12): Number of chroma bins to use.
chroma_length (`int`, *optional*, defaults to 235):
Maximum chroma duration if audio is used to condition the model. Corresponds to the maximum duration used during training.
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the text encoder config.
- **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the audio encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
the decoder config.
Example:
```python
>>> from transformers import (
... MusicgenMelodyConfig,
... MusicgenMelodyDecoderConfig,
... T5Config,
... EncodecConfig,
... MusicgenMelodyForConditionalGeneration,
... )
>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenMelodyDecoderConfig()
>>> configuration = MusicgenMelodyConfig.from_sub_models_config(
... text_encoder_config, audio_encoder_config, decoder_config
... )
>>> # Initializing a MusicgenMelodyForConditionalGeneration (with random weights) from the facebook/musicgen-melody style configuration
>>> model = MusicgenMelodyForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder
>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen_melody-model")
>>> # loading model and config from pretrained folder
>>> musicgen_melody_config = MusicgenMelodyConfig.from_pretrained("musicgen_melody-model")
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("musicgen_melody-model", config=musicgen_melody_config)
```"""
model_type = "musicgen_melody"
is_composition = True
def __init__(
self,
num_chroma=12,
chroma_length=235,
**kwargs,
):
super().__init__(**kwargs)
if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
text_encoder_config = kwargs.pop("text_encoder")
text_encoder_model_type = text_encoder_config.pop("model_type")
audio_encoder_config = kwargs.pop("audio_encoder")
audio_encoder_model_type = audio_encoder_config.pop("model_type")
decoder_config = kwargs.pop("decoder")
self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config)
self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config)
self.decoder = MusicgenMelodyDecoderConfig(**decoder_config)
self.is_encoder_decoder = False
self.num_chroma = num_chroma
self.chroma_length = chroma_length
@classmethod
def from_sub_models_config(
cls,
text_encoder_config: PretrainedConfig,
audio_encoder_config: PretrainedConfig,
decoder_config: MusicgenMelodyDecoderConfig,
**kwargs,
):
r"""
Instantiate a [`MusicgenMelodyConfig`] (or a derived class) from text encoder, audio encoder and decoder
configurations.
Returns:
[`MusicgenMelodyConfig`]: An instance of a configuration object
"""
return cls(
text_encoder=text_encoder_config.to_dict(),
audio_encoder=audio_encoder_config.to_dict(),
decoder=decoder_config.to_dict(),
**kwargs,
)
@property
# This is a property because you might want to change the codec model on the fly
def sampling_rate(self):
return self.audio_encoder.sampling_rate
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Musicgen Melody checkpoints from the original repository."""
import argparse
from pathlib import Path
from typing import Dict, OrderedDict, Tuple
import torch
from audiocraft.models import MusicGen
from transformers import (
AutoTokenizer,
EncodecModel,
T5EncoderModel,
)
from transformers.models.musicgen_melody.configuration_musicgen_melody import MusicgenMelodyDecoderConfig
from transformers.models.musicgen_melody.feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from transformers.models.musicgen_melody.modeling_musicgen_melody import (
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
)
from transformers.models.musicgen_melody.processing_musicgen_melody import MusicgenMelodyProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
EXPECTED_MISSING_KEYS = ["model.decoder.embed_positions.weights"]
EXPECTED_ADDITIONAL_KEYS = ["condition_provider.conditioners.self_wav.chroma.spec.window"]
def rename_keys(name):
if "emb" in name:
name = name.replace("emb", "model.decoder.embed_tokens")
if "transformer" in name:
name = name.replace("transformer", "model.decoder")
if "cross_attention" in name:
name = name.replace("cross_attention", "encoder_attn")
if "linear1" in name:
name = name.replace("linear1", "fc1")
if "linear2" in name:
name = name.replace("linear2", "fc2")
if "norm1" in name:
name = name.replace("norm1", "self_attn_layer_norm")
if "norm_cross" in name:
name = name.replace("norm_cross", "encoder_attn_layer_norm")
if "norm2" in name:
name = name.replace("norm2", "final_layer_norm")
if "out_norm" in name:
name = name.replace("out_norm", "model.decoder.layer_norm")
if "linears" in name:
name = name.replace("linears", "lm_heads")
if "condition_provider.conditioners.description.output_proj" in name:
name = name.replace("condition_provider.conditioners.description.output_proj", "enc_to_dec_proj")
if "condition_provider.conditioners.self_wav.output_proj" in name:
name = name.replace("condition_provider.conditioners.self_wav.output_proj", "audio_enc_to_dec_proj")
return name
def rename_state_dict(state_dict: OrderedDict, hidden_size: int) -> Tuple[Dict, Dict, Dict]:
"""Function that takes the original Audiocraft MusicgenMelody state dict and renames it according to the HF
module names. It further partitions the state dict into the decoder (LM) state dict, the state dict for the
text encoder projection, and the state dict for the audio encoder projection."""
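# e.g. "transformer.layers.0.self_attn.in_proj_weight" becomes "model.decoder.layers.0.self_attn.in_proj_weight"
# and is then split below into separate q_proj / k_proj / v_proj weights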
keys = list(state_dict.keys())
enc_dec_proj_state_dict = {}
audio_enc_to_dec_proj_state_dict = {}
for key in keys:
val = state_dict.pop(key)
key = rename_keys(key)
if "in_proj_weight" in key:
# split fused qkv proj
state_dict[key.replace("in_proj_weight", "q_proj.weight")] = val[:hidden_size, :]
state_dict[key.replace("in_proj_weight", "k_proj.weight")] = val[hidden_size : 2 * hidden_size, :]
state_dict[key.replace("in_proj_weight", "v_proj.weight")] = val[-hidden_size:, :]
elif "audio_enc_to_dec_proj" in key:
audio_enc_to_dec_proj_state_dict[key[len("audio_enc_to_dec_proj.") :]] = val
elif "enc_to_dec_proj" in key:
enc_dec_proj_state_dict[key[len("enc_to_dec_proj.") :]] = val
else:
state_dict[key] = val
return state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict
def decoder_config_from_checkpoint(checkpoint: str) -> MusicgenMelodyDecoderConfig:
if checkpoint == "facebook/musicgen-melody" or checkpoint == "facebook/musicgen-stereo-melody":
hidden_size = 1536
num_hidden_layers = 48
num_attention_heads = 24
elif checkpoint == "facebook/musicgen-melody-large" or checkpoint == "facebook/musicgen-stereo-melody-large":
hidden_size = 2048
num_hidden_layers = 48
num_attention_heads = 32
else:
raise ValueError(
"Checkpoint should be one of `['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, "
"or `['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
f"for the stereo checkpoints, got {checkpoint}."
)
if "stereo" in checkpoint:
audio_channels = 2
num_codebooks = 8
else:
audio_channels = 1
num_codebooks = 4
config = MusicgenMelodyDecoderConfig(
hidden_size=hidden_size,
ffn_dim=hidden_size * 4,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_codebooks=num_codebooks,
audio_channels=audio_channels,
)
return config
@torch.no_grad()
def convert_musicgen_melody_checkpoint(
checkpoint, pytorch_dump_folder=None, repo_id=None, device="cpu", test_same_output=False
):
fairseq_model = MusicGen.get_pretrained(checkpoint, device=device)
decoder_config = decoder_config_from_checkpoint(checkpoint)
decoder_state_dict = fairseq_model.lm.state_dict()
decoder_state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict = rename_state_dict(
decoder_state_dict, hidden_size=decoder_config.hidden_size
)
text_encoder = T5EncoderModel.from_pretrained("t5-base")
audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
decoder = MusicgenMelodyForCausalLM(decoder_config).eval()
# load all decoder weights - expect that we'll be missing embeddings and enc-dec projection
missing_keys, unexpected_keys = decoder.load_state_dict(decoder_state_dict, strict=False)
for key in missing_keys.copy():
if key.startswith(("text_encoder", "audio_encoder")) or key in EXPECTED_MISSING_KEYS:
missing_keys.remove(key)
for key in unexpected_keys.copy():
if key in EXPECTED_ADDITIONAL_KEYS:
unexpected_keys.remove(key)
if len(missing_keys) > 0:
raise ValueError(f"Missing key(s) in state_dict: {missing_keys}")
if len(unexpected_keys) > 0:
raise ValueError(f"Unexpected key(s) in state_dict: {unexpected_keys}")
# init the composite model
model = MusicgenMelodyForConditionalGeneration(
text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder
).to(device)
# load the pre-trained enc-dec projection (from the decoder state dict)
model.enc_to_dec_proj.load_state_dict(enc_dec_proj_state_dict)
# load the pre-trained audio encoder projection (from the decoder state dict)
model.audio_enc_to_dec_proj.load_state_dict(audio_enc_to_dec_proj_state_dict)
# check we can do a forward pass
input_ids = torch.arange(0, 2 * decoder_config.num_codebooks, dtype=torch.long).reshape(2, -1).to(device)
decoder_input_ids = input_ids.reshape(2 * decoder_config.num_codebooks, -1).to(device)
with torch.no_grad():
logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
output_length = 1 + input_ids.shape[1] + model.config.chroma_length
if logits.shape != (2 * decoder_config.num_codebooks, output_length, 2048):
raise ValueError("Incorrect shape for logits")
# now construct the processor
tokenizer = AutoTokenizer.from_pretrained("t5-base")
feature_extractor = MusicgenMelodyFeatureExtractor()
processor = MusicgenMelodyProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2048
# set other default generation config params
model.generation_config.max_length = int(30 * audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 3.0
if test_same_output:
# check that the converted model gives the same output as the original model
decoder_input_ids = torch.ones_like(decoder_input_ids).to(device) * model.generation_config.pad_token_id
with torch.no_grad():
decoder_input_ids = decoder_input_ids[: decoder_config.num_codebooks]
inputs = processor(text=["gen"], return_tensors="pt", padding=True).to(device)
logits = model(**inputs, decoder_input_ids=decoder_input_ids).logits
attributes, prompt_tokens = fairseq_model._prepare_tokens_and_attributes(["gen"], None)
original_logits = fairseq_model.lm.forward(
decoder_input_ids.reshape(1, decoder_config.num_codebooks, -1), attributes
)
torch.testing.assert_close(
original_logits.squeeze(2).reshape(decoder_config.num_codebooks, -1),
logits[:, -1],
rtol=1e-5,
atol=5e-5,
)
if pytorch_dump_folder is not None:
Path(pytorch_dump_folder).mkdir(exist_ok=True)
logger.info(f"Saving model {checkpoint} to {pytorch_dump_folder}")
model.save_pretrained(pytorch_dump_folder)
processor.save_pretrained(pytorch_dump_folder)
if repo_id:
logger.info(f"Pushing model {checkpoint} to {repo_id}")
model.push_to_hub(repo_id, create_pr=True)
processor.push_to_hub(repo_id, create_pr=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--checkpoint",
default="facebook/musicgen-melody",
type=str,
help="Checkpoint size of the Musicgen Melody model you'd like to convert. Can be one of: "
"`['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, or "
"`['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
"for the stereo checkpoints.",
)
parser.add_argument(
"--pytorch_dump_folder",
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
default="musicgen-melody",
type=str,
help="Where to upload the converted model on the 🤗 hub.",
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."
)
parser.add_argument("--test_same_output", default=False, type=bool, help="If `True`, test if same output logits.")
args = parser.parse_args()
convert_musicgen_melody_checkpoint(
args.checkpoint, args.pytorch_dump_folder, args.push_to_hub, args.device, args.test_same_output
)
# coding=utf-8
# Copyright 2024 Meta AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for Musicgen Melody
"""
import copy
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import chroma_filter_bank
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, is_torch_available, is_torchaudio_available, logging
if is_torch_available():
import torch
if is_torchaudio_available():
import torchaudio
logger = logging.get_logger(__name__)
class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a MusicgenMelody feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts chroma features from audio processed by [Demucs](https://github.com/adefossez/demucs/tree/main) or
directly from raw audio waveform.
Args:
feature_size (`int`, *optional*, defaults to 12):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 32000):
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
hop_length (`int`, *optional*, defaults to 4096):
Hop length (in samples) between the overlapping STFT windows used to obtain the chroma features.
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.
n_fft (`int`, *optional*, defaults to 16384):
Size of the Fourier transform.
num_chroma (`int`, *optional*, defaults to 12):
Number of chroma bins to use.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio.
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether to return the attention mask. Can be overwritten when calling the feature extractor.
[What are attention masks?](../glossary#attention-mask)
<Tip>
For Musicgen Melody models, audio `attention_mask` is not strictly necessary: inputs are padded to the maximum
length with the silence value (zero) and no attention mask by default.
</Tip>
stem_indices (`List[int]`, *optional*, defaults to `[3, 2]`):
Stem channels to extract if demucs outputs are passed.
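Example (an illustrative sketch; the expected shape follows the feature extractor integration test):
```python
>>> import numpy as np
>>> from transformers import MusicgenMelodyFeatureExtractor
>>> feature_extractor = MusicgenMelodyFeatureExtractor()
>>> # one second of a synthetic 440 Hz tone at the default 32 kHz sampling rate
>>> waveform = np.sin(2 * np.pi * 440 * np.arange(32000) / 32000).astype(np.float32)
>>> features = feature_extractor(waveform, sampling_rate=32000, return_tensors="pt")
>>> # one-hot chroma features of shape (batch_size, num_frames, num_chroma), here (1, 8, 12)
>>> input_features = features["input_features"]
```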
"""
model_input_names = ["input_features"]
def __init__(
self,
feature_size=12,
sampling_rate=32000,
hop_length=4096,
chunk_length=30,
n_fft=16384,
num_chroma=12,
padding_value=0.0,
return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
stem_indices=[3, 2],
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.n_fft = n_fft
self.hop_length = hop_length
self.chunk_length = chunk_length
self.n_samples = chunk_length * sampling_rate
self.sampling_rate = sampling_rate
self.chroma_filters = torch.from_numpy(
chroma_filter_bank(sampling_rate=sampling_rate, num_frequency_bins=n_fft, tuning=0, num_chroma=num_chroma)
).float()
self.spectrogram = torchaudio.transforms.Spectrogram(
n_fft=n_fft, win_length=n_fft, hop_length=hop_length, power=2, center=True, pad=0, normalized=True
)
self.stem_indices = stem_indices
def _torch_extract_fbank_features(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute the chroma spectrogram of the provided audio using the torchaudio spectrogram implementation and a librosa-style chroma filter bank (`audio_utils.chroma_filter_bank`).
"""
# if wav length is not long enough, pad it
wav_length = waveform.shape[-1]
if wav_length < self.n_fft:
pad = self.n_fft - wav_length
rest = 0 if pad % 2 == 0 else 1
waveform = torch.nn.functional.pad(waveform, (pad // 2, pad // 2 + rest), "constant", 0)
# squeeze alongside channel dimension
spec = self.spectrogram(waveform).squeeze(1)
# sum along the frequency dimension
raw_chroma = torch.einsum("cf, ...ft->...ct", self.chroma_filters, spec)
# normalise with max value
norm_chroma = torch.nn.functional.normalize(raw_chroma, p=float("inf"), dim=-2, eps=1e-6)
# transpose time and chroma dimension -> (batch, time, chroma)
norm_chroma = norm_chroma.transpose(1, 2)
# replace max value alongside chroma dimension with 1 and replace the rest with 0
idx = norm_chroma.argmax(-1, keepdim=True)
norm_chroma[:] = 0
norm_chroma.scatter_(dim=-1, index=idx, value=1)
return norm_chroma
def _extract_stem_indices(self, audio, sampling_rate=None):
"""
Extracts stems from the output of the [Demucs](https://github.com/adefossez/demucs/tree/main) audio separation model,
then converts them to mono-channel and resamples them to the feature extractor sampling rate.
Args:
audio (`torch.Tensor` of shape `(batch_size, num_stems, channel_size, audio_length)`):
The output of the Demucs model to be processed.
sampling_rate (`int`, *optional*):
Demucs sampling rate. If not specified, defaults to `44000`.
"""
sampling_rate = 44000 if sampling_rate is None else sampling_rate
# extract "vocals" and "others" sources from audio encoder (demucs) output
# [batch_size, num_stems, channel_size, audio_length]
wav = audio[:, torch.tensor(self.stem_indices)]
# merge extracted stems to single waveform
wav = wav.sum(1)
# convert to mono-channel waveform
wav = wav.mean(dim=1, keepdim=True)
# resample to model sampling rate
# not equivalent to julius.resample
if sampling_rate != self.sampling_rate:
wav = torchaudio.functional.resample(
wav, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
)
# [batch_size, 1, audio_length] -> [batch_size, audio_length]
wav = wav.squeeze(1)
return wav
def __call__(
self,
audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
padding: Optional[str] = True,
max_length: Optional[int] = None,
sampling_rate: Optional[int] = None,
**kwargs,
) -> BatchFeature:
"""
Main method to featurize and prepare for the model one or several sequence(s).
Args:
audio (`torch.Tensor`, `np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a torch tensor, a numpy array, a list of float
values, a list of numpy arrays, a list of torch tensors, or a list of list of float values.
If `audio` is the output of Demucs, it has to be a torch tensor of shape `(batch_size, num_stems, channel_size, audio_length)`.
Otherwise, it must be mono or stereo channel audio.
truncation (`bool`, *optional*, defaults to `True`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*, defaults to None):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
[What are attention masks?](../glossary#attention-mask)
<Tip>
For Musicgen Melody models, audio `attention_mask` is not necessary.
</Tip>
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
sampling_rate (`int`, *optional*):
The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
Note that if `audio` is the output of Demucs, `sampling_rate` must be the sampling rate at which Demucs operates.
"""
if sampling_rate is None:
logger.warning_once(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
if isinstance(audio, torch.Tensor) and len(audio.shape) == 4:
logger.warning_once(
"`audio` is a 4-dimensional torch tensor and has thus been recognized as the output of `Demucs`. "
"If this is not the case, make sure to read Musicgen Melody docstrings and "
"to correct `audio` to get the right behaviour."
"Link to the docstrings: https://huggingface.co/docs/transformers/main/en/model_doc/musicgen_melody"
)
audio = self._extract_stem_indices(audio, sampling_rate=sampling_rate)
elif sampling_rate is not None and sampling_rate != self.sampling_rate:
audio = torchaudio.functional.resample(
audio, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
)
is_batched = isinstance(audio, (np.ndarray, torch.Tensor)) and len(audio.shape) > 1
is_batched = is_batched or (
isinstance(audio, (list, tuple)) and (isinstance(audio[0], (torch.Tensor, np.ndarray, tuple, list)))
)
if is_batched and not isinstance(audio[0], torch.Tensor):
audio = [torch.tensor(speech, dtype=torch.float32).unsqueeze(-1) for speech in audio]
elif is_batched:
audio = [speech.unsqueeze(-1) for speech in audio]
elif not is_batched and not isinstance(audio, torch.Tensor):
audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(-1)
if isinstance(audio[0], torch.Tensor) and audio[0].dtype is torch.float64:
audio = [speech.to(torch.float32) for speech in audio]
# always return batch
if not is_batched:
audio = [audio]
if len(audio[0].shape) == 3:
logger.warning_once(
"`audio` has been detected as a batch of stereo signals. Will be convert to mono signals. "
"If this is an undesired behaviour, make sure to read Musicgen Melody docstrings and "
"to correct `audio` to get the right behaviour."
"Link to the docstrings: https://huggingface.co/docs/transformers/main/en/model_doc/musicgen_melody"
)
# convert to mono-channel waveform
audio = [stereo.mean(dim=0) for stereo in audio]
batched_speech = BatchFeature({"input_features": audio})
padded_inputs = self.pad(
batched_speech,
padding=padding,
max_length=max_length if max_length else self.n_samples,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_tensors="pt",
)
input_features = self._torch_extract_fbank_features(padded_inputs["input_features"].squeeze(-1))
padded_inputs["input_features"] = input_features
if return_attention_mask:
# rescale from raw audio length to spectrogram length
padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]
if return_tensors is not None:
padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
return padded_inputs
def to_dict(self) -> Dict[str, Any]:
"""
Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
if "mel_filters" in output:
del output["mel_filters"]
if "window" in output:
del output["window"]
if "chroma_filters" in output:
del output["chroma_filters"]
if "spectrogram" in output:
del output["spectrogram"]
return output
# coding=utf-8
# Copyright 2024 Meta AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Text/audio processor class for MusicGen Melody
"""
from typing import List, Optional
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import to_numpy
class MusicgenMelodyProcessor(ProcessorMixin):
r"""
Constructs a MusicGen Melody processor which wraps a [`MusicgenMelodyFeatureExtractor`] - for chroma feature extraction from raw audio waveforms or Demucs outputs - and a T5 tokenizer into a single processor
class.
[`MusicgenMelodyProcessor`] offers all the functionalities of [`MusicgenMelodyFeatureExtractor`] and [`T5Tokenizer`]. See
[`~MusicgenMelodyProcessor.__call__`] and [`~MusicgenMelodyProcessor.decode`] for more information.
Args:
feature_extractor (`MusicgenMelodyFeatureExtractor`):
An instance of [`MusicgenMelodyFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`T5Tokenizer`):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
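Example (an illustrative sketch; mirrors the `get_unconditional_inputs` example further down in this file):
```python
>>> from transformers import MusicgenMelodyProcessor
>>> processor = MusicgenMelodyProcessor.from_pretrained("facebook/musicgen-melody")
>>> # tokenize a text prompt; an audio prompt can be added via `audio=` and `sampling_rate=`
>>> inputs = processor(text=["80s pop track with bassy drums and synth"], padding=True, return_tensors="pt")
```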
"""
feature_extractor_class = "MusicgenMelodyFeatureExtractor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
# Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.get_decoder_prompt_ids
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, audio=None, text=None, **kwargs):
"""
Main method to prepare for the model one or several text sequence(s) and audio(s). This method forwards the `audio`
and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
`None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
Args:
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
The audio or batch of audios to be prepared. Each audio can be a NumPy array or a PyTorch tensor. In the case
of a NumPy array/PyTorch tensor, each audio should be a mono or stereo signal of shape (T,), where T is the sample length of the audio.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
kwargs (*optional*):
Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
tokenizer.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
- **attention_mask** -- List of token indices specifying which tokens should be attended to by the model when `text` is not `None`.
When only `audio` is specified, returns the timestamps attention mask.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
if text is None:
return audio_inputs
elif audio is None:
return inputs
else:
inputs["input_features"] = audio_inputs["input_features"]
return inputs
# Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.batch_decode with padding_mask->attention_mask
def batch_decode(self, *args, **kwargs):
"""
This method is used to decode either batches of audio outputs from the MusicGen model, or batches of token ids
from the tokenizer. In the case of decoding token ids, this method forwards all its arguments to T5Tokenizer's
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
audio_values = kwargs.pop("audio", None)
attention_mask = kwargs.pop("attention_mask", None)
if len(args) > 0:
audio_values = args[0]
args = args[1:]
if audio_values is not None:
return self._decode_audio(audio_values, attention_mask=attention_mask)
else:
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor.decode
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
# Copied from transformers.models.musicgen.processing_musicgen.MusicgenProcessor._decode_audio with padding_mask->attention_mask
def _decode_audio(self, audio_values, attention_mask: Optional = None) -> List[np.ndarray]:
"""
This method strips any padding from the audio values to return a list of numpy audio arrays.
"""
audio_values = to_numpy(audio_values)
bsz, channels, seq_len = audio_values.shape
if attention_mask is None:
return list(audio_values)
attention_mask = to_numpy(attention_mask)
# match the sequence length of the padding mask to the generated audio arrays by padding with the **non-padding**
# token (so that the generated audio values are **not** treated as padded tokens)
difference = seq_len - attention_mask.shape[-1]
padding_value = 1 - self.feature_extractor.padding_value
attention_mask = np.pad(attention_mask, ((0, 0), (0, difference)), "constant", constant_values=padding_value)
audio_values = audio_values.tolist()
for i in range(bsz):
sliced_audio = np.asarray(audio_values[i])[
attention_mask[i][None, :] != self.feature_extractor.padding_value
]
audio_values[i] = sliced_audio.reshape(channels, -1)
return audio_values
def get_unconditional_inputs(self, num_samples=1, return_tensors="pt"):
"""
Helper function to get null inputs for unconditional generation, enabling the model to be used without the
feature extractor or tokenizer.
Args:
num_samples (int, *optional*):
Number of audio samples to unconditionally generate.
Example:
```python
>>> from transformers import MusicgenMelodyForConditionalGeneration, MusicgenMelodyProcessor
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")
>>> # get the unconditional (or 'null') inputs for the model
>>> processor = MusicgenMelodyProcessor.from_pretrained("facebook/musicgen-melody")
>>> unconditional_inputs = processor.get_unconditional_inputs(num_samples=1)
>>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
```"""
inputs = self.tokenizer([""] * num_samples, return_tensors=return_tensors, return_attention_mask=True)
inputs["attention_mask"][:] = 0
return inputs
......@@ -5876,6 +5876,37 @@ class MusicgenProcessor(metaclass=DummyObject):
requires_backends(self, ["torch"])
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST = None
class MusicgenMelodyForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MusicgenMelodyForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MusicgenMelodyModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MusicgenMelodyPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
MVP_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
# This file is autogenerated by the command `make fix-copies`, do not edit.
from ..utils import DummyObject, requires_backends
class MusicgenMelodyFeatureExtractor(metaclass=DummyObject):
_backends = ["torchaudio"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torchaudio"])
class MusicgenMelodyProcessor(metaclass=DummyObject):
_backends = ["torchaudio"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torchaudio"])
# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import math
import os
import random
import tempfile
import unittest
import numpy as np
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torchaudio,
)
from transformers.utils.import_utils import is_torchaudio_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torchaudio_available():
import torch
from transformers import MusicgenMelodyFeatureExtractor
global_rng = random.Random()
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
# Copied from tests.models.musicgen.test_modeling_musicgen.get_bip_bip
def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000):
"""Produces a series of 'bip bip' sounds at a given frequency."""
timesteps = np.arange(int(duration * sample_rate)) / sample_rate
wav = np.cos(2 * math.pi * 440 * timesteps)
time_period = (timesteps % (2 * bip_duration)) / (2 * bip_duration)
envelope = time_period >= 0.5
return wav * envelope
@require_torch
@require_torchaudio
class MusicgenMelodyFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size=12,
padding_value=0.0,
sampling_rate=4_000,
return_attention_mask=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.padding_value = padding_value
self.sampling_rate = sampling_rate
self.return_attention_mask = return_attention_mask
self.feature_size = feature_size
self.num_chroma = feature_size
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"padding_value": self.padding_value,
"sampling_rate": self.sampling_rate,
"return_attention_mask": self.return_attention_mask,
}
# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
else:
# make sure that inputs increase in size
speech_inputs = [
floats_list((x, self.feature_size))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
@require_torchaudio
@require_torch
class MusicgenMelodyFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = MusicgenMelodyFeatureExtractor if is_torchaudio_available() else None
def setUp(self):
self.feat_extract_tester = MusicgenMelodyFeatureExtractionTester(self)
# Copied from tests.models.seamless_m4t.test_feature_extraction_seamless_m4t.SeamlessM4TFeatureExtractionTest.test_feat_extract_from_and_save_pretrained
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
self.assertDictEqual(dict_first, dict_second)
# Copied from tests.models.seamless_m4t.test_feature_extraction_seamless_m4t.SeamlessM4TFeatureExtractionTest.test_feat_extract_to_json_file
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
self.assertEqual(dict_first, dict_second)
def test_call(self):
# Tests that all calls wrap to encode_plus and batch_encode_plus
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test feature size
input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
self.assertTrue(input_features.ndim == 3)
self.assertTrue(input_features.shape[0] == 3)
# Ignore copy
self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
# Test not batched input
encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
# Test batched
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test 2-D numpy arrays are batched.
speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
np_speech_inputs = np.asarray(speech_inputs)
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
@require_torchaudio
def test_call_from_demucs(self):
# Tests that all calls wrap to encode_plus and batch_encode_plus
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# (batch_size, num_stems, channel_size, audio_length)
inputs = torch.rand([4, 5, 2, 44000])
# Test feature size
input_features = feature_extractor(inputs, padding=True, return_tensors="np").input_features
self.assertTrue(input_features.ndim == 3)
self.assertTrue(input_features.shape[0] == 4)
self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
# Test single input
encoded_sequences_1 = feature_extractor(inputs[[0]], return_tensors="np").input_features
self.assertTrue(np.allclose(encoded_sequences_1[0], input_features[0], atol=1e-3))
# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad with input_features->input_features
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def test_integration(self):
EXPECTED_INPUT_FEATURES = torch.zeros([2, 8, 12])
EXPECTED_INPUT_FEATURES[0, :6, 9] = 1
EXPECTED_INPUT_FEATURES[0, 6:, 0] = 1
EXPECTED_INPUT_FEATURES[1, :, 9] = 1
input_speech = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)]
feature_extractor = MusicgenMelodyFeatureExtractor()
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
self.assertEqual(input_features.shape, (2, 8, 12))
self.assertTrue((input_features == EXPECTED_INPUT_FEATURES).all())
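As a reading aid for the integration test above, here is a minimal, hedged usage sketch of the feature extractor. It assumes torch and torchaudio are installed (the chroma computation depends on them) and that the waveform is already at the extractor's expected sampling rate; the sine-wave input and variable names are illustrative and not part of the test suite.

```python
# Hedged sketch only: mirrors what test_integration above exercises.
import numpy as np
from transformers import MusicgenMelodyFeatureExtractor

feature_extractor = MusicgenMelodyFeatureExtractor()  # default config, as in test_integration

# one second of a 440 Hz tone at the extractor's sampling rate (illustrative input)
t = np.linspace(0.0, 1.0, int(feature_extractor.sampling_rate), endpoint=False)
waveform = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

features = feature_extractor(waveform, return_tensors="pt").input_features
# shape is (batch, chroma_frames, feature_size); the tests above check that the
# last dimension equals feature_extractor.feature_size (12 chroma bins)
print(features.shape)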
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the MusicGen processor."""
import random
import shutil
import tempfile
import unittest
import numpy as np
from transformers import T5Tokenizer, T5TokenizerFast
from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
from transformers.utils.import_utils import is_torchaudio_available
if is_torchaudio_available():
from transformers import MusicgenMelodyFeatureExtractor, MusicgenMelodyProcessor
global_rng = random.Random()
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
@require_torch
@require_sentencepiece
@require_torchaudio
# Copied from tests.models.musicgen.test_processing_musicgen.MusicgenProcessorTest with Musicgen->MusicgenMelody, Encodec->MusicgenMelody, padding_mask->attention_mask, input_values->input_features
class MusicgenMelodyProcessorTest(unittest.TestCase):
def setUp(self):
# Ignore copy
self.checkpoint = "facebook/musicgen-melody"
self.tmpdirname = tempfile.mkdtemp()
def get_tokenizer(self, **kwargs):
return T5Tokenizer.from_pretrained(self.checkpoint, **kwargs)
def get_feature_extractor(self, **kwargs):
return MusicgenMelodyFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = MusicgenMelodyProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, T5TokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, MusicgenMelodyFeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = MusicgenMelodyProcessor(
tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()
)
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
processor = MusicgenMelodyProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, T5TokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, MusicgenMelodyFeatureExtractor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
raw_speech = floats_list((3, 1000))
input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
input_processor = processor(raw_speech, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
input_str = "This is a test string"
encoded_processor = processor(text=input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok.keys():
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(sequences=predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.assertListEqual(
processor.model_input_names,
feature_extractor.model_input_names,
msg="`processor` and `feature_extractor` model input names do not match",
)
# Ignore copy
def test_decode_audio(self):
feature_extractor = self.get_feature_extractor(padding_side="left")
tokenizer = self.get_tokenizer()
processor = MusicgenMelodyProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
attention_mask = np.zeros((3, 20))
attention_mask[0, -5:] = 1
attention_mask[1, -20:] = 1
attention_mask[2, -10:] = 1
generated_speech = np.asarray(floats_list((3, 20)))[:, None, :]
decoded_audios = processor.batch_decode(generated_speech, attention_mask=attention_mask)
self.assertIsInstance(decoded_audios, list)
for audio in decoded_audios:
self.assertIsInstance(audio, np.ndarray)
self.assertTrue(decoded_audios[0].shape == (1, 5))
self.assertTrue(decoded_audios[1].shape == (1, 20))
self.assertTrue(decoded_audios[2].shape == (1, 10))
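For readers of the processor tests above, a hedged sketch of the behaviour they cover follows: construction from a tokenizer plus feature extractor, separate text and audio preprocessing, and padding removal in `batch_decode`. The checkpoint name comes from `setUp` above; the random inputs are placeholders, and the sketch assumes sentencepiece and torchaudio are available and that the checkpoint ships both tokenizer and feature extractor configs.

```python
# Hedged sketch only: mirrors MusicgenMelodyProcessorTest above.
import numpy as np
from transformers import MusicgenMelodyFeatureExtractor, MusicgenMelodyProcessor, T5Tokenizer

checkpoint = "facebook/musicgen-melody"
processor = MusicgenMelodyProcessor(
    tokenizer=T5Tokenizer.from_pretrained(checkpoint),
    feature_extractor=MusicgenMelodyFeatureExtractor.from_pretrained(checkpoint, padding_side="left"),
)

# text is routed to the tokenizer, raw audio to the feature extractor
text_inputs = processor(text="90s rock song with electric guitar")
audio_inputs = processor(np.random.randn(16000).astype(np.float32), return_tensors="np")

# batch_decode keeps only the unmasked samples of each generated waveform,
# which is what test_decode_audio asserts on the output shapes
generated = np.random.randn(2, 1, 20).astype(np.float32)
attention_mask = np.zeros((2, 20))
attention_mask[0, -5:] = 1
attention_mask[1, -20:] = 1
audios = processor.batch_decode(generated, attention_mask=attention_mask)
print([audio.shape for audio in audios])  # expected [(1, 5), (1, 20)] per the test logic
```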
......@@ -20,6 +20,7 @@ import pytest
from transformers.audio_utils import (
amplitude_to_db,
chroma_filter_bank,
hertz_to_mel,
mel_filter_bank,
mel_to_hertz,
......@@ -27,6 +28,11 @@ from transformers.audio_utils import (
spectrogram,
window_function,
)
from transformers.testing_utils import is_librosa_available, require_librosa
if is_librosa_available():
from librosa.filters import chroma
class AudioUtilsFunctionTester(unittest.TestCase):
......@@ -755,3 +761,57 @@ class AudioUtilsFunctionTester(unittest.TestCase):
amplitude_to_db(spectrogram, min_value=0.0)
with pytest.raises(ValueError):
amplitude_to_db(spectrogram, db_range=-80)
@require_librosa
def test_chroma_equivalence(self):
num_frequency_bins = 25
num_chroma = 6
sampling_rate = 24000
# test default parameters
original_chroma = chroma(sr=sampling_rate, n_chroma=num_chroma, n_fft=num_frequency_bins)
utils_chroma = chroma_filter_bank(
num_frequency_bins=num_frequency_bins, num_chroma=num_chroma, sampling_rate=sampling_rate
)
self.assertTrue(np.allclose(original_chroma, utils_chroma))
# test no weighting_parameters
original_chroma = chroma(sr=sampling_rate, n_chroma=num_chroma, n_fft=num_frequency_bins, octwidth=None)
utils_chroma = chroma_filter_bank(
num_frequency_bins=num_frequency_bins,
num_chroma=num_chroma,
sampling_rate=sampling_rate,
weighting_parameters=None,
)
self.assertTrue(np.allclose(original_chroma, utils_chroma))
# test with L1 norm
original_chroma = chroma(sr=sampling_rate, n_chroma=num_chroma, n_fft=num_frequency_bins, norm=1.0)
utils_chroma = chroma_filter_bank(
num_frequency_bins=num_frequency_bins, num_chroma=num_chroma, sampling_rate=sampling_rate, power=1.0
)
self.assertTrue(np.allclose(original_chroma, utils_chroma))
# test starting at 'A' chroma, power = None, tuning = 0, different weighting_parameters
original_chroma = chroma(
sr=sampling_rate,
n_chroma=num_chroma,
n_fft=num_frequency_bins,
norm=None,
base_c=None,
octwidth=1.0,
ctroct=4.0,
)
utils_chroma = chroma_filter_bank(
num_frequency_bins=num_frequency_bins,
num_chroma=num_chroma,
sampling_rate=sampling_rate,
power=None,
start_at_c_chroma=False,
weighting_parameters=(4.0, 1.0),
)
self.assertTrue(np.allclose(original_chroma, utils_chroma))
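For reference, the parameter correspondence that test_chroma_equivalence relies on can be summarised as below. The snippet is a hedged reading aid built only from the arguments used in the test above, not additional API documentation.

```python
# Mapping exercised above (librosa.filters.chroma -> transformers.audio_utils.chroma_filter_bank):
#   sr                 -> sampling_rate
#   n_chroma           -> num_chroma
#   n_fft              -> num_frequency_bins
#   norm               -> power                 (norm=None    ~ power=None)
#   (ctroct, octwidth) -> weighting_parameters  (octwidth=None ~ weighting_parameters=None)
#   base_c             -> start_at_c_chroma     (base_c=None  ~ start_at_c_chroma=False)
from transformers.audio_utils import chroma_filter_bank

filters = chroma_filter_bank(
    num_frequency_bins=25,
    num_chroma=6,
    sampling_rate=24000,
    power=1.0,                      # librosa: norm=1.0
    start_at_c_chroma=True,         # librosa: base_c=True (the default)
    weighting_parameters=(5.0, 2),  # librosa: ctroct=5.0, octwidth=2 (the defaults)
)
```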
......@@ -395,6 +395,7 @@ OBJECTS_TO_IGNORE = [
"MraConfig",
"MusicgenDecoderConfig",
"MusicgenForConditionalGeneration",
"MusicgenMelodyForConditionalGeneration",
"MvpConfig",
"MvpTokenizerFast",
"MT5Tokenizer",
......
......@@ -294,6 +294,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"BarkCoarseModel",
"BarkFineModel",
"BarkSemanticModel",
"MusicgenMelodyModel",
"MusicgenModel",
"MusicgenForConditionalGeneration",
"SpeechT5ForSpeechToSpeech",
......
......@@ -176,6 +176,7 @@ docs/source/en/model_doc/mpt.md
docs/source/en/model_doc/mra.md
docs/source/en/model_doc/mt5.md
docs/source/en/model_doc/musicgen.md
docs/source/en/model_doc/musicgen_melody.md
docs/source/en/model_doc/mvp.md
docs/source/en/model_doc/nat.md
docs/source/en/model_doc/nezha.md
......@@ -706,6 +707,7 @@ src/transformers/models/mt5/modeling_flax_mt5.py
src/transformers/models/mt5/modeling_mt5.py
src/transformers/models/mt5/modeling_tf_mt5.py
src/transformers/models/musicgen/convert_musicgen_transformers.py
src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py
src/transformers/models/mvp/modeling_mvp.py
src/transformers/models/nezha/modeling_nezha.py
src/transformers/models/nllb_moe/configuration_nllb_moe.py
......
......@@ -8,4 +8,6 @@ docs/source/en/tasks/prompting.md
src/transformers/models/blip_2/modeling_blip_2.py
src/transformers/models/ctrl/modeling_ctrl.py
src/transformers/models/fuyu/modeling_fuyu.py
src/transformers/models/kosmos2/modeling_kosmos2.py
\ No newline at end of file
src/transformers/models/kosmos2/modeling_kosmos2.py
src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
src/transformers/models/musicgen_melody/processing_musicgen_melody.py
\ No newline at end of file