Commit 5b593f58 authored by Yoach Lacombe's avatar Yoach Lacombe
Browse files

replace 300M reference to 600M and Mini

parent 613564c8
...@@ -33,8 +33,8 @@ import torch ...@@ -33,8 +33,8 @@ import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu" device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_300M_v0.1").to(device) model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_300M_v0.1") tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")
prompt = "Hey, how are you doing today?" prompt = "Hey, how are you doing today?"
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast." description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
......
...@@ -6,7 +6,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed ...@@ -6,7 +6,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
device = "cuda:0" if torch.cuda.is_available() else "cpu" device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "parler-tts/parler_tts_300M_v0.1" repo_id = "parler-tts/parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device) model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id) tokenizer = AutoTokenizer.from_pretrained(repo_id)
......
...@@ -64,4 +64,4 @@ if __name__ == "__main__": ...@@ -64,4 +64,4 @@ if __name__ == "__main__":
model.config.pad_token_id = encodec_vocab_size model.config.pad_token_id = encodec_vocab_size
model.config.decoder_start_token_id = encodec_vocab_size+1 model.config.decoder_start_token_id = encodec_vocab_size+1
model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-300M/")) model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-600M/"))
...@@ -2,7 +2,7 @@ from parler_tts import ParlerTTSForConditionalGeneration ...@@ -2,7 +2,7 @@ from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor from transformers import AutoTokenizer, AutoFeatureExtractor
path = "TODO" path = "TODO"
repo_id = "parler_tts_300M" repo_id = "parler_tts_600M"
AutoFeatureExtractor.from_pretrained("ylacombe/dac_44khZ_8kbps").push_to_hub(repo_id) AutoFeatureExtractor.from_pretrained("ylacombe/dac_44khZ_8kbps").push_to_hub(repo_id)
......
{ {
"model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/", "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
"save_to_disk": "./tmp_dataset_audio/", "save_to_disk": "./tmp_dataset_audio/",
"temporary_save_to_disk": "./audio_code_tmp/", "temporary_save_to_disk": "./audio_code_tmp/",
......
{ {
"model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/", "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
"save_to_disk": "./tmp_dataset_audio/", "save_to_disk": "./tmp_dataset_audio/",
"temporary_save_to_disk": "./audio_code_tmp/", "temporary_save_to_disk": "./audio_code_tmp/",
......
...@@ -71,7 +71,7 @@ And then enter an authentication token from https://huggingface.co/settings/toke ...@@ -71,7 +71,7 @@ And then enter an authentication token from https://huggingface.co/settings/toke
Depending on your compute resources and your dataset, you need to choose between fine-tuning a pre-trained model and training a new model from scratch. Depending on your compute resources and your dataset, you need to choose between fine-tuning a pre-trained model and training a new model from scratch.
In that sense, we released a 300M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_300M_v0.1`](https://huggingface.co/parler-tts/parler_tts_300M_v0.1), that you can fine-tune for your own use-case. In that sense, we released a 600M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_mini_v0.1`](https://huggingface.co/parler-tts/parler_tts_mini_v0.1), that you can fine-tune for your own use-case.
You can also train your own model from scratch. You can find [here](/helpers/model_init_scripts/) examples on how to initialize a model from scratch. For example, you can initialize a dummy model with: You can also train your own model from scratch. You can find [here](/helpers/model_init_scripts/) examples on how to initialize a model from scratch. For example, you can initialize a dummy model with:
...@@ -79,10 +79,10 @@ You can also train you own model from scratch. You can find [here](/helpers/mode ...@@ -79,10 +79,10 @@ You can also train you own model from scratch. You can find [here](/helpers/mode
python helpers/model_init_scripts/init_dummy_model.py ./parler-tts-untrained-dummy --text_model "google-t5/t5-small" --audio_model "parler-tts/dac_44khZ_8kbps" python helpers/model_init_scripts/init_dummy_model.py ./parler-tts-untrained-dummy --text_model "google-t5/t5-small" --audio_model "parler-tts/dac_44khZ_8kbps"
``` ```
In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 300-M parameters that we'll initialize with: In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 600M-parameter model that we'll initialize with:
```sh ```sh
python helpers/model_init_scripts/init_model_300M.py ./parler-tts-untrained-300M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps" python helpers/model_init_scripts/init_model_600M.py ./parler-tts-untrained-600M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps"
``` ```
...@@ -113,7 +113,7 @@ To train Parler-TTS v0.1, we roughly used: ...@@ -113,7 +113,7 @@ To train Parler-TTS v0.1, we roughly used:
```sh ```sh
accelerate launch ./training/run_parler_tts_training.py \ accelerate launch ./training/run_parler_tts_training.py \
--model_name_or_path "./parler-tts-untrained-300M/parler-tts-untrained-300M/" \ --model_name_or_path "./parler-tts-untrained-600M/parler-tts-untrained-600M/" \
--feature_extractor_name "parler-tts/dac_44khZ_8kbps" \ --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
--description_tokenizer_name "google/flan-t5-base" \ --description_tokenizer_name "google/flan-t5-base" \
--prompt_tokenizer_name "google/flan-t5-base" \ --prompt_tokenizer_name "google/flan-t5-base" \
...@@ -202,4 +202,4 @@ And finally, two additional comments: ...@@ -202,4 +202,4 @@ And finally, two additional comments:
> [!TIP] > [!TIP]
> Fine-tuning is as easy as modifying `model_name_or_path` to a pre-trained model. > Fine-tuning is as easy as modifying `model_name_or_path` to a pre-trained model.
> For example: `--model_name_or_path parler-tts/parler_tts_300M_v0.1`. > For example: `--model_name_or_path parler-tts/parler_tts_mini_v0.1`.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment