Commit 31a54850 authored by Yoach Lacombe
Browse files

make style

parent 91542bfa
......@@ -13,7 +13,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1,
vocab_size=encodec_vocab_size + 1,
max_position_embeddings=2048,
num_hidden_layers=4,
ffn_dim=512,
......@@ -27,34 +27,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0,
pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1,
bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks,
)
# TODO: ?? how to make it stop ?
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size
vocab_size=t5.vocab_size,
)
# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-model/")
\ No newline at end of file
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-model/")
......@@ -20,7 +20,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1,
vocab_size=encodec_vocab_size + 1,
max_position_embeddings=2048,
num_hidden_layers=4,
ffn_dim=512,
......@@ -34,34 +34,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0,
pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1,
bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks,
)
# TODO: ?? how to make it stop ?
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size
vocab_size=t5.vocab_size,
)
# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-dac-model/")
\ No newline at end of file
model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-dac-model/")
......@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1,
max_position_embeddings=3000, # 30 s = 2580
vocab_size=encodec_vocab_size + 1,
max_position_embeddings=3000, # 30 s = 2580
num_hidden_layers=12,
ffn_dim=4096,
num_attention_heads=16,
......@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0,
pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1,
bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks,
)
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
vocab_size = t5.vocab_size
vocab_size=t5.vocab_size,
)
# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/")
\ No newline at end of file
model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/")
......@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config = ParlerTTSDecoderConfig(
vocab_size=encodec_vocab_size+1,
max_position_embeddings=4096, # 30 s = 2580
vocab_size=encodec_vocab_size + 1,
max_position_embeddings=4096, # 30 s = 2580
num_hidden_layers=8,
ffn_dim=3072,
num_attention_heads=12,
......@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout=0.0,
pad_token_id=encodec_vocab_size,
eos_token_id=encodec_vocab_size,
bos_token_id=encodec_vocab_size+1,
bos_token_id=encodec_vocab_size + 1,
num_codebooks=num_codebooks,
)
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder_small/")
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
text_encoder_pretrained_model_name_or_path=text_model,
audio_encoder_pretrained_model_name_or_path=encodec_version,
decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder_small/",
vocab_size = t5.vocab_size
vocab_size=t5.vocab_size,
)
# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size+1
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.generation_config.do_sample = False # True
model.generation_config.guidance_scale = 1 # 3.0
model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/")
\ No newline at end of file
model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/")
from .configuration_parler_tts import ParlerTTSConfig, ParlerTTSDecoderConfig
from .modeling_parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, apply_delay_pattern_mask, build_delay_pattern_mask
from .modeling_parler_tts import (
ParlerTTSForCausalLM,
ParlerTTSForConditionalGeneration,
apply_delay_pattern_mask,
build_delay_pattern_mask,
)
from .dac_wrapper import DACConfig, DACModel
\ No newline at end of file
from .dac_wrapper import DACConfig, DACModel
......@@ -81,7 +81,7 @@ class ParlerTTSDecoderConfig(PretrainedConfig):
def __init__(
self,
vocab_size=2049, # vocab size = 2048 (encodec vocab size) + 1 (eos)
vocab_size=2049, # vocab size = 2048 (encodec vocab size) + 1 (eos)
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
......
from .configuration_dac import DACConfig
from .modeling_dac import DACModel
\ No newline at end of file
from .modeling_dac import DACModel
......@@ -8,17 +8,16 @@ class DACConfig(PretrainedConfig):
def __init__(
self,
num_codebooks: int = 9,
model_bitrate: int = 8, # kbps
model_bitrate: int = 8, # kbps
codebook_size: int = 1024,
latent_dim: int = 1024,
frame_rate: int = 86,
**kwargs,
):
self.codebook_size = codebook_size
self.model_bitrate = model_bitrate
self.latent_dim = latent_dim
self.num_codebooks = num_codebooks
self.frame_rate = frame_rate
super().__init__(**kwargs)
\ No newline at end of file
super().__init__(**kwargs)
......@@ -7,22 +7,24 @@ from .configuration_dac import DACConfig
from dac.model import DAC
# model doesn't support batching yet
# model doesn't support batching yet
class DACModel(PreTrainedModel):
config_class = DACConfig
def __init__(self, config):
super().__init__(config)
self.model = DAC(
n_codebooks = config.num_codebooks,
latent_dim = config.latent_dim,
codebook_size = config.codebook_size,
n_codebooks=config.num_codebooks,
latent_dim=config.latent_dim,
codebook_size=config.codebook_size,
)
def encode(self, input_values, padding_mask=None, bandwidth=None, return_dict=None, n_quantizers=None, sample_rate=None):
def encode(
self, input_values, padding_mask=None, bandwidth=None, return_dict=None, n_quantizers=None, sample_rate=None
):
"""
Encodes the input audio waveform into discrete codes.
......@@ -44,7 +46,7 @@ class DACModel(PreTrainedModel):
factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
`codebook` of shape `[batch_size, num_codebooks, frames]`.
Scale is not used here.
"""
_, channels, input_length = input_values.shape
......@@ -52,12 +54,12 @@ class DACModel(PreTrainedModel):
raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
audio_data = self.model.preprocess(input_values, sample_rate)
return_dict = return_dict if return_dict is not None else self.config.return_dict
# TODO: for now, no chunk length
chunk_length = None # self.config.chunk_length
chunk_length = None # self.config.chunk_length
if chunk_length is None:
chunk_length = input_length
stride = input_length
......@@ -79,9 +81,9 @@ class DACModel(PreTrainedModel):
for offset in range(0, input_length - step, stride):
mask = padding_mask[..., offset : offset + chunk_length].bool()
frame = audio_data[:, :, offset : offset + chunk_length]
scale = None
_, encoded_frame, _, _, _ = self.model.encode(frame, n_quantizers=n_quantizers)
encoded_frames.append(encoded_frame)
scales.append(scale)
......@@ -92,15 +94,14 @@ class DACModel(PreTrainedModel):
return (encoded_frames, scales)
return EncodecEncoderOutput(encoded_frames, scales)
def decode(
self,
audio_codes,
audio_scales,
padding_mask = None,
return_dict = None,
):
self,
audio_codes,
audio_scales,
padding_mask=None,
return_dict=None,
):
"""
Decodes the given frames into an output audio waveform.
......@@ -125,12 +126,12 @@ class DACModel(PreTrainedModel):
if len(audio_codes) != 1:
raise ValueError(f"Expected one frame, got {len(audio_codes)}")
audio_values = self.model.quantizer.from_codes(audio_codes.squeeze(0))[0]
audio_values = self.model.decode(audio_values)
if not return_dict:
return (audio_values,)
return EncodecDecoderOutput(audio_values)
def forward(self, tensor):
raise ValueError(f"`DACModel.forward` not implemented yet")
\ No newline at end of file
raise ValueError(f"`DACModel.forward` not implemented yet")
This diff is collapsed.
import dac
# Download a model
model_path = dac.utils.download(model_type="44khz")
model = dac.DAC.load(model_path)
......@@ -10,6 +10,7 @@ hf_dac = DACModel(DACConfig())
hf_dac.model.load_state_dict(model.state_dict())
from transformers import AutoConfig, AutoModel
AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)
......@@ -20,4 +21,4 @@ hf_dac.push_to_hub("ylacombe/dac_44khZ_8kbps")
from transformers import EncodecFeatureExtractor
EncodecFeatureExtractor(sampling_rate=44100).push_to_hub("ylacombe/dac_44khZ_8kbps")
\ No newline at end of file
EncodecFeatureExtractor(sampling_rate=44100).push_to_hub("ylacombe/dac_44khZ_8kbps")
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment