Commit 5b593f58 authored by Yoach Lacombe

Replace 300M references with 600M and Mini

parent 613564c8
@@ -33,8 +33,8 @@ import torch
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_300M_v0.1").to(device)
-tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_300M_v0.1")
+model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
+tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")
 prompt = "Hey, how are you doing today?"
 description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
@@ -6,7 +6,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-repo_id = "parler-tts/parler_tts_300M_v0.1"
+repo_id = "parler-tts/parler_tts_mini_v0.1"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -64,4 +64,4 @@ if __name__ == "__main__":
     model.config.pad_token_id = encodec_vocab_size
     model.config.decoder_start_token_id = encodec_vocab_size+1
-    model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-300M/"))
+    model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-600M/"))
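The two `model.config` lines in the hunk above reserve ids just past the audio codec's vocabulary for padding and decoder start, so they can never collide with a real audio token. A minimal sketch of that bookkeeping, assuming a hypothetical codebook size of 1024 (the real value is read from the DAC checkpoint):

```python
# Sketch of the special-token bookkeeping in the init script above.
# The codec only emits ids 0 .. vocab_size - 1, so the two ids just past
# the codebook are safe to reserve for padding and decoder start.
encodec_vocab_size = 1024  # hypothetical codebook size, for illustration only

pad_token_id = encodec_vocab_size                # one past the last codec id
decoder_start_token_id = encodec_vocab_size + 1  # two past the last codec id

# Neither special id falls inside the codec's own id range:
assert pad_token_id not in range(encodec_vocab_size)
assert decoder_start_token_id not in range(encodec_vocab_size)
```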
@@ -2,7 +2,7 @@ from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor
 path = "TODO"
-repo_id = "parler_tts_300M"
+repo_id = "parler_tts_600M"
 AutoFeatureExtractor.from_pretrained("ylacombe/dac_44khZ_8kbps").push_to_hub(repo_id)
 {
-    "model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/",
+    "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
     "save_to_disk": "./tmp_dataset_audio/",
     "temporary_save_to_disk": "./audio_code_tmp/",
 {
-    "model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/",
+    "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
     "save_to_disk": "./tmp_dataset_audio/",
     "temporary_save_to_disk": "./audio_code_tmp/",
@@ -71,7 +71,7 @@ And then enter an authentication token from https://huggingface.co/settings/toke
 Depending on your compute resources and your dataset, you need to choose between fine-tuning a pre-trained model and training a new model from scratch.
-In that sense, we released a 300M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_300M_v0.1`](https://huggingface.co/parler-tts/parler_tts_300M_v0.1), that you can fine-tune for your own use-case.
+In that sense, we released a 600M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_mini_v0.1`](https://huggingface.co/parler-tts/parler_tts_mini_v0.1), that you can fine-tune for your own use-case.
 You can also train your own model from scratch. You can find [here](/helpers/model_init_scripts/) examples on how to initialize a model from scratch. For example, you can initialize a dummy model with:
@@ -79,10 +79,10 @@ You can also train your own model from scratch. You can find [here](/helpers/mode
 python helpers/model_init_scripts/init_dummy_model.py ./parler-tts-untrained-dummy --text_model "google-t5/t5-small" --audio_model "parler-tts/dac_44khZ_8kbps"
 ```
-In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 300M-parameter model that we'll initialize with:
+In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 600M-parameter model that we'll initialize with:
 ```sh
-python helpers/model_init_scripts/init_model_300M.py ./parler-tts-untrained-300M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps"
+python helpers/model_init_scripts/init_model_600M.py ./parler-tts-untrained-600M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps"
 ```
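A note on the doubled directory (`./parler-tts-untrained-600M/parler-tts-untrained-600M/`) that the training configs reference: the init script saves the model into a subfolder named after the checkpoint, inside the output directory passed on the command line, so calling it with `./parler-tts-untrained-600M` nests the name twice. A quick sketch of the resulting path (pure path arithmetic, no Parler-TTS dependency):

```python
import os

# The init script joins the chosen save directory with a fixed subfolder name,
# which is why the path the training configs point at repeats the name.
save_directory = "./parler-tts-untrained-600M"
checkpoint_path = os.path.join(save_directory, "parler-tts-untrained-600M/")
print(checkpoint_path)  # ./parler-tts-untrained-600M/parler-tts-untrained-600M/
```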
@@ -113,7 +113,7 @@ To train Parler-TTS v0.1, we roughly used:
 ```sh
 accelerate launch ./training/run_parler_tts_training.py \
-    --model_name_or_path "./parler-tts-untrained-300M/parler-tts-untrained-300M/" \
+    --model_name_or_path "./parler-tts-untrained-600M/parler-tts-untrained-600M/" \
     --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
     --description_tokenizer_name "google/flan-t5-base" \
     --prompt_tokenizer_name "google/flan-t5-base" \
@@ -202,4 +202,4 @@ And finally, two additional comments:
 > [!TIP]
 > Fine-tuning is as easy as modifying `model_name_or_path` to a pre-trained model.
-> For example: `--model_name_or_path parler-tts/parler_tts_300M_v0.1`.
+> For example: `--model_name_or_path parler-tts/parler_tts_mini_v0.1`.