Commit 31a54850 authored by Yoach Lacombe
Browse files

make style

parent 91542bfa
...@@ -13,7 +13,7 @@ encodec_vocab_size = encodec.codebook_size ...@@ -13,7 +13,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig( decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1, vocab_size=encodec_vocab_size + 1,
max_position_embeddings=2048, max_position_embeddings=2048,
num_hidden_layers=4, num_hidden_layers=4,
ffn_dim=512, ffn_dim=512,
...@@ -27,34 +27,32 @@ decoder_config = ParlerTTSDecoderConfig( ...@@ -27,34 +27,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0, activation_dropout=0.0,
pad_token_id=encodec_vocab_size, pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size, eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1, bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks, num_codebooks=num_codebooks,
) )
# TODO: ?? how to make it stop ? # TODO: ?? how to make it stop ?
decoder = ParlerTTSForCausalLM(decoder_config) decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/") decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained( model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model, text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version, audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/", decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size vocab_size=t5.vocab_size,
) )
# set the appropriate bos/pad token ids # set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1 model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params # set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0 model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-model/") model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-model/")
\ No newline at end of file
...@@ -20,7 +20,7 @@ encodec_vocab_size = encodec.codebook_size ...@@ -20,7 +20,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig( decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1, vocab_size=encodec_vocab_size + 1,
max_position_embeddings=2048, max_position_embeddings=2048,
num_hidden_layers=4, num_hidden_layers=4,
ffn_dim=512, ffn_dim=512,
...@@ -34,34 +34,32 @@ decoder_config = ParlerTTSDecoderConfig( ...@@ -34,34 +34,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0, activation_dropout=0.0,
pad_token_id=encodec_vocab_size, pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size, eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1, bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks, num_codebooks=num_codebooks,
) )
# TODO: ?? how to make it stop ? # TODO: ?? how to make it stop ?
decoder = ParlerTTSForCausalLM(decoder_config) decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/") decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained( model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model, text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version, audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/", decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size vocab_size=t5.vocab_size,
) )
# set the appropriate bos/pad token ids # set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1 model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params # set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0 model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-dac-model/") model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-dac-model/")
\ No newline at end of file
...@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size ...@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig( decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1, vocab_size=encodec_vocab_size + 1,
max_position_embeddings=3000, # 30 s = 2580 max_position_embeddings=3000, # 30 s = 2580
num_hidden_layers=12, num_hidden_layers=12,
ffn_dim=4096, ffn_dim=4096,
num_attention_heads=16, num_attention_heads=16,
...@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig( ...@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0, activation_dropout=0.0,
pad_token_id=encodec_vocab_size, pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size, eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1, bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks, num_codebooks=num_codebooks,
) )
decoder = ParlerTTSForCausalLM(decoder_config) decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/") decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained( model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model, text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version, audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/", decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size vocab_size=t5.vocab_size,
) )
# set the appropriate bos/pad token ids # set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1 model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params # set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0 model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/") model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/")
\ No newline at end of file
...@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size ...@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig( decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1, vocab_size=encodec_vocab_size + 1,
max_position_embeddings=4096, # 30 s = 2580 max_position_embeddings=4096, # 30 s = 2580
num_hidden_layers=8, num_hidden_layers=8,
ffn_dim=3072, ffn_dim=3072,
num_attention_heads=12, num_attention_heads=12,
...@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig( ...@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0, activation_dropout=0.0,
pad_token_id=encodec_vocab_size, pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size, eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1, bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks, num_codebooks=num_codebooks,
) )
decoder = ParlerTTSForCausalLM(decoder_config) decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder_small/") decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder_small/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained( model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model, text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version, audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder_small/", decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder_small/",
vocab_size = t5.vocab_size vocab_size=t5.vocab_size,
) )
# set the appropriate bos/pad token ids # set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1 model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params # set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate) model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0 model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/") model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/")
\ No newline at end of file
from .configuration_parler_tts import ParlerTTSConfig, ParlerTTSDecoderConfig from .configuration_parler_tts import ParlerTTSConfig, ParlerTTSDecoderConfig
from .modeling_parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, apply_delay_pattern_mask, build_delay_pattern_mask from .modeling_parler_tts import (
ParlerTTSForCausalLM,
ParlerTTSForConditionalGeneration,
apply_delay_pattern_mask,
build_delay_pattern_mask,
)
from .dac_wrapper import DACConfig, DACModel from .dac_wrapper import DACConfig, DACModel
\ No newline at end of file
...@@ -81,7 +81,7 @@ class ParlerTTSDecoderConfig(PretrainedConfig): ...@@ -81,7 +81,7 @@ class ParlerTTSDecoderConfig(PretrainedConfig):
def __init__( def __init__(
self, self,
vocab_size=2049, # vocab size = 2048 (encodec vocab size) + 1 (eos) vocab_size=2049, # vocab size = 2048 (encodec vocab size) + 1 (eos)
max_position_embeddings=2048, max_position_embeddings=2048,
num_hidden_layers=24, num_hidden_layers=24,
ffn_dim=4096, ffn_dim=4096,
......
from .configuration_dac import DACConfig from .configuration_dac import DACConfig
from .modeling_dac import DACModel from .modeling_dac import DACModel
\ No newline at end of file
...@@ -8,17 +8,16 @@ class DACConfig(PretrainedConfig): ...@@ -8,17 +8,16 @@ class DACConfig(PretrainedConfig):
def __init__( def __init__(
self, self,
num_codebooks: int = 9, num_codebooks: int = 9,
model_bitrate: int = 8, # kbps model_bitrate: int = 8, # kbps
codebook_size: int = 1024, codebook_size: int = 1024,
latent_dim: int = 1024, latent_dim: int = 1024,
frame_rate: int = 86, frame_rate: int = 86,
**kwargs, **kwargs,
): ):
self.codebook_size = codebook_size self.codebook_size = codebook_size
self.model_bitrate = model_bitrate self.model_bitrate = model_bitrate
self.latent_dim = latent_dim self.latent_dim = latent_dim
self.num_codebooks = num_codebooks self.num_codebooks = num_codebooks
self.frame_rate = frame_rate self.frame_rate = frame_rate
super().__init__(**kwargs) super().__init__(**kwargs)
\ No newline at end of file
...@@ -7,22 +7,24 @@ from .configuration_dac import DACConfig ...@@ -7,22 +7,24 @@ from .configuration_dac import DACConfig
from dac.model import DAC from dac.model import DAC
# model doesn't support batching yet
# model doesn't support batching yet
class DACModel(PreTrainedModel): class DACModel(PreTrainedModel):
config_class = DACConfig config_class = DACConfig
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.model = DAC( self.model = DAC(
n_codebooks = config.num_codebooks, n_codebooks=config.num_codebooks,
latent_dim = config.latent_dim, latent_dim=config.latent_dim,
codebook_size = config.codebook_size, codebook_size=config.codebook_size,
) )
def encode(self, input_values, padding_mask=None, bandwidth=None, return_dict=None, n_quantizers=None, sample_rate=None): def encode(
self, input_values, padding_mask=None, bandwidth=None, return_dict=None, n_quantizers=None, sample_rate=None
):
""" """
Encodes the input audio waveform into discrete codes. Encodes the input audio waveform into discrete codes.
...@@ -44,7 +46,7 @@ class DACModel(PreTrainedModel): ...@@ -44,7 +46,7 @@ class DACModel(PreTrainedModel):
factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
`codebook` of shape `[batch_size, num_codebooks, frames]`. `codebook` of shape `[batch_size, num_codebooks, frames]`.
Scale is not used here. Scale is not used here.
""" """
_, channels, input_length = input_values.shape _, channels, input_length = input_values.shape
...@@ -52,12 +54,12 @@ class DACModel(PreTrainedModel): ...@@ -52,12 +54,12 @@ class DACModel(PreTrainedModel):
raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}") raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
audio_data = self.model.preprocess(input_values, sample_rate) audio_data = self.model.preprocess(input_values, sample_rate)
return_dict = return_dict if return_dict is not None else self.config.return_dict return_dict = return_dict if return_dict is not None else self.config.return_dict
# TODO: for now, no chunk length # TODO: for now, no chunk length
chunk_length = None # self.config.chunk_length chunk_length = None # self.config.chunk_length
if chunk_length is None: if chunk_length is None:
chunk_length = input_length chunk_length = input_length
stride = input_length stride = input_length
...@@ -79,9 +81,9 @@ class DACModel(PreTrainedModel): ...@@ -79,9 +81,9 @@ class DACModel(PreTrainedModel):
for offset in range(0, input_length - step, stride): for offset in range(0, input_length - step, stride):
mask = padding_mask[..., offset : offset + chunk_length].bool() mask = padding_mask[..., offset : offset + chunk_length].bool()
frame = audio_data[:, :, offset : offset + chunk_length] frame = audio_data[:, :, offset : offset + chunk_length]
scale = None scale = None
_, encoded_frame, _, _, _ = self.model.encode(frame, n_quantizers=n_quantizers) _, encoded_frame, _, _, _ = self.model.encode(frame, n_quantizers=n_quantizers)
encoded_frames.append(encoded_frame) encoded_frames.append(encoded_frame)
scales.append(scale) scales.append(scale)
...@@ -92,15 +94,14 @@ class DACModel(PreTrainedModel): ...@@ -92,15 +94,14 @@ class DACModel(PreTrainedModel):
return (encoded_frames, scales) return (encoded_frames, scales)
return EncodecEncoderOutput(encoded_frames, scales) return EncodecEncoderOutput(encoded_frames, scales)
def decode( def decode(
self, self,
audio_codes, audio_codes,
audio_scales, audio_scales,
padding_mask = None, padding_mask=None,
return_dict = None, return_dict=None,
): ):
""" """
Decodes the given frames into an output audio waveform. Decodes the given frames into an output audio waveform.
...@@ -125,12 +126,12 @@ class DACModel(PreTrainedModel): ...@@ -125,12 +126,12 @@ class DACModel(PreTrainedModel):
if len(audio_codes) != 1: if len(audio_codes) != 1:
raise ValueError(f"Expected one frame, got {len(audio_codes)}") raise ValueError(f"Expected one frame, got {len(audio_codes)}")
audio_values = self.model.quantizer.from_codes(audio_codes.squeeze(0))[0] audio_values = self.model.quantizer.from_codes(audio_codes.squeeze(0))[0]
audio_values = self.model.decode(audio_values) audio_values = self.model.decode(audio_values)
if not return_dict: if not return_dict:
return (audio_values,) return (audio_values,)
return EncodecDecoderOutput(audio_values) return EncodecDecoderOutput(audio_values)
def forward(self, tensor): def forward(self, tensor):
raise ValueError(f"`DACModel.forward` not implemented yet") raise ValueError(f"`DACModel.forward` not implemented yet")
\ No newline at end of file
This diff is collapsed.
import dac import dac
# Download a model # Download a model
model_path = dac.utils.download(model_type="44khz") model_path = dac.utils.download(model_type="44khz")
model = dac.DAC.load(model_path) model = dac.DAC.load(model_path)
...@@ -10,6 +10,7 @@ hf_dac = DACModel(DACConfig()) ...@@ -10,6 +10,7 @@ hf_dac = DACModel(DACConfig())
hf_dac.model.load_state_dict(model.state_dict()) hf_dac.model.load_state_dict(model.state_dict())
from transformers import AutoConfig, AutoModel from transformers import AutoConfig, AutoModel
AutoConfig.register("dac", DACConfig) AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel) AutoModel.register(DACConfig, DACModel)
...@@ -20,4 +21,4 @@ hf_dac.push_to_hub("ylacombe/dac_44khZ_8kbps") ...@@ -20,4 +21,4 @@ hf_dac.push_to_hub("ylacombe/dac_44khZ_8kbps")
from transformers import EncodecFeatureExtractor from transformers import EncodecFeatureExtractor
EncodecFeatureExtractor(sampling_rate=44100).push_to_hub("ylacombe/dac_44khZ_8kbps") EncodecFeatureExtractor(sampling_rate=44100).push_to_hub("ylacombe/dac_44khZ_8kbps")
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment