Commit 5564a484 authored by Yoach Lacombe
Browse files

finalize init model scripts

parent 7ea2b865
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse
def parse_args(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, as the script did before.

    Returns:
        The parsed namespace with ``save_directory``, ``text_model`` and
        ``audio_model`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    parser.add_argument("text_model", type=str, help="Repository id or path to the text encoder.")
    parser.add_argument("audio_model", type=str, help="Repository id or path to the audio encoder.")
    return parser.parse_args(argv)


def decoder_config_kwargs(codebook_size, num_codebooks):
    """Return the keyword arguments for the tiny decoder configuration.

    Args:
        codebook_size: ``codebook_size`` read from the audio encoder config.
        num_codebooks: ``num_codebooks`` read from the audio encoder config.

    Returns:
        A dict that can be splatted into ``ParlerTTSDecoderConfig``.
    """
    return {
        "vocab_size": codebook_size + 1,
        "max_position_embeddings": 2048,
        "num_hidden_layers": 4,
        "ffn_dim": 512,
        "num_attention_heads": 8,
        "layerdrop": 0.0,
        "use_cache": True,
        "activation_function": "gelu",
        "hidden_size": 512,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        # pad and eos share the first id after the codebook entries; bos is the next one
        "pad_token_id": codebook_size,
        "eos_token_id": codebook_size,
        "bos_token_id": codebook_size + 1,
        "num_codebooks": num_codebooks,
    }


def main(argv=None):
    """Create an untrained tiny model and save it under ``save_directory``.

    The decoder is written to ``<save_directory>/decoder`` and the assembled
    model to ``<save_directory>/tiny-model``.

    Args:
        argv: Optional list of argument strings forwarded to ``parse_args``.
    """
    args = parse_args(argv)

    text_model = args.text_model
    encodec_version = args.audio_model

    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)

    encodec_vocab_size = encodec.codebook_size
    num_codebooks = encodec.num_codebooks
    print("num_codebooks", num_codebooks)

    decoder_config = ParlerTTSDecoderConfig(**decoder_config_kwargs(encodec_vocab_size, num_codebooks))

    # The decoder is saved first because the composite model below loads it back from this path.
    decoder_dir = os.path.join(args.save_directory, "decoder")
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(decoder_dir)

    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=decoder_dir,
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params
    # 30 * frame_rate: presumably 30 seconds worth of audio frames -- TODO confirm the unit of frame_rate
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory, "tiny-model"))


if __name__ == "__main__":
    main()
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
# Builds an untrained tiny model from a text encoder, an audio encoder and a
# freshly created decoder, and stores everything below TMP_DIR.
TMP_DIR = "./tmp/artefacts/"

text_model = "google-t5/t5-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
num_codebooks = 9

# Configurations of the two pretrained sub-models.
t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)
encodec_vocab_size = encodec.codebook_size

# Decoder hyper-parameters, gathered in one mapping before building the config.
decoder_kwargs = {
    "vocab_size": encodec_vocab_size + 1,
    "max_position_embeddings": 2048,
    "num_hidden_layers": 4,
    "ffn_dim": 512,
    "num_attention_heads": 8,
    "layerdrop": 0.0,
    "use_cache": True,
    "activation_function": "gelu",
    "hidden_size": 512,
    "dropout": 0.0,
    "attention_dropout": 0.0,
    "activation_dropout": 0.0,
    "pad_token_id": encodec_vocab_size,
    "eos_token_id": encodec_vocab_size,
    "bos_token_id": encodec_vocab_size + 1,
    "num_codebooks": num_codebooks,
}
decoder_config = ParlerTTSDecoderConfig(**decoder_kwargs)

# The decoder is written to disk so that it can be loaded by path below.
decoder_dir = os.path.join(TMP_DIR, "decoder")
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained(decoder_dir)

model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path=decoder_dir,
    vocab_size=t5.vocab_size,
)

# Default generation settings: first the bos/pad/eos token ids, then the
# remaining generation parameters, applied in the same order as listed.
generation_defaults = {
    "decoder_start_token_id": encodec_vocab_size + 1,
    "pad_token_id": encodec_vocab_size,
    "eos_token_id": encodec_vocab_size,
    "max_length": int(30 * model.audio_encoder.config.frame_rate),
    "do_sample": True,
    "guidance_scale": 1,
}
for option, value in generation_defaults.items():
    setattr(model.generation_config, option, value)

model.save_pretrained(os.path.join(TMP_DIR, "tiny-model"))
# Reconstructed post-commit version of the script: the save location is taken
# from the command line instead of the former hard-coded TMP_DIR = "./artefacts/".
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    args = parser.parse_args()

    text_model = "google-t5/t5-small"
    encodec_version = "facebook/encodec_24khz"

    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)

    encodec_vocab_size = encodec.codebook_size
    num_codebooks = 8
    print("num_codebooks", num_codebooks)

    decoder_config = ParlerTTSDecoderConfig(
        vocab_size=encodec_vocab_size + 1,
        max_position_embeddings=2048,
        num_hidden_layers=4,
        ffn_dim=512,
        num_attention_heads=8,
        layerdrop=0.0,
        use_cache=True,
        activation_function="gelu",
        hidden_size=512,
        dropout=0.0,
        attention_dropout=0.0,
        activation_dropout=0.0,
        pad_token_id=encodec_vocab_size,
        eos_token_id=encodec_vocab_size,
        bos_token_id=encodec_vocab_size + 1,
        num_codebooks=num_codebooks,
    )

    # The decoder is saved first because the composite model below loads it back from this path.
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(os.path.join(args.save_directory, "decoder"))

    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=os.path.join(args.save_directory, "decoder"),
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True  # True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory, "tiny-model"))
# Reconstructed post-commit version of the script: the text encoder, the audio
# encoder and the save location come from the command line. The pre-commit
# version hard-coded "google/flan-t5-base", "ylacombe/dac_44khZ_8kbps",
# num_codebooks = 9 and TMP_DIR = "./tmp/artefacts/".
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    parser.add_argument("text_model", type=str, help="Repository id or path to the text encoder.")
    parser.add_argument("audio_model", type=str, help="Repository id or path to the audio encoder.")
    args = parser.parse_args()

    text_model = args.text_model
    encodec_version = args.audio_model

    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)

    encodec_vocab_size = encodec.codebook_size
    num_codebooks = encodec.num_codebooks
    print("num_codebooks", num_codebooks)

    decoder_config = ParlerTTSDecoderConfig(
        vocab_size=encodec_vocab_size + 64,  # + 64 instead of +1 to have a multiple of 64
        max_position_embeddings=4096,  # 30 s = 2580
        num_hidden_layers=24,
        ffn_dim=4096,
        num_attention_heads=16,
        layerdrop=0.0,
        use_cache=True,
        activation_function="gelu",
        hidden_size=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        pad_token_id=encodec_vocab_size,
        eos_token_id=encodec_vocab_size,
        bos_token_id=encodec_vocab_size + 1,
        num_codebooks=num_codebooks,
    )

    # The decoder is saved first because the composite model below loads it back from this path.
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(os.path.join(args.save_directory, "decoder"))

    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=os.path.join(args.save_directory, "decoder"),
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True  # True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory,"stable-speech-untrained-300M/"))
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment