Project: chenpangpang / parler-tts

Commit e7cc576a — authored Mar 04, 2024 by Yoach Lacombe
Parent: 9bde9933

    add dac config, init, and temporary datasets saving

Changes: 8 changed files, with 510 additions and 198 deletions (+510 −198)
example_configs/librispeech_tts_r.json              +1    −1
example_configs/librispeech_tts_r_100.json          +7    −8
example_configs/librispeech_tts_r_75M.json          +77   −0
example_configs/librispeech_tts_r_dummy_dac.json    +76   −0
init_dummy_model_dac.py                             +67   −0
init_model.py                                       +9    −3
init_model_75M.py                                   +67   −0
run_stable_speech_training.py                       +206  −186
example_configs/librispeech_tts_r.json

@@ -63,7 +63,7 @@
     "evaluation_strategy": "steps",
     "eval_steps": 600,
     "per_device_eval_batch_size": 8,
-    "generation_max_length": 400,
+    "generation_max_length": 2250,
     "fp16": false,
     "seed": 456,
example_configs/librispeech_tts_r_100.json

 {
     "model_name_or_path": "/raid/yoach/tmp/artefacts/small-stable-speech-untrained/",
-    "feature_extractor_name": "facebook/encodec_24khz",
+    "feature_extractor_name": "ylacombe/dac_44khZ_8kbps",
     "description_tokenizer_name": "google-t5/t5-small",
     "prompt_tokenizer_name": "google-t5/t5-small",
-    "push_to_hub": true,
+    "push_to_hub": false,
     "hub_model_id": "ylacombe/stable-speech-mini",
     "report_to": ["wandb"],
-    "overwrite_output_dir": true,
+    "overwrite_output_dir": false,
     "output_dir": "/raid/yoach/tmp/artefacts/training-mini/",

@@ -34,7 +34,7 @@
     "add_audio_samples_to_wandb": true,
     "id_column_name": "id",
-    "preprocessing_num_workers": 1,
+    "preprocessing_num_workers": 8,
     "pad_token_id": 1024,

@@ -45,7 +45,7 @@
     "num_train_epochs": 15,
     "gradient_accumulation_steps": 1,
     "gradient_checkpointing": true,
-    "per_device_train_batch_size": 40,
+    "per_device_train_batch_size": 28,
     "learning_rate": 1e-4,
     "adam_beta1": 0.9,
     "adam_beta2": 0.999,

@@ -63,11 +63,10 @@
     "predict_with_generate": true,
     "include_inputs_for_metrics": true,
     "evaluation_strategy": "steps",
-    "eval_steps": 3000,
-    "save_steps": 3000,
+    "eval_steps": 2500,
+    "save_steps": 2499,
     "per_device_eval_batch_size": 8,
-    "generation_max_length": 400,
     "audio_encode_per_device_eval_batch_size": 32,
     "dtype": "float16",
example_configs/librispeech_tts_r_75M.json (new file, mode 100644)

{
    "model_name_or_path": "/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/",
    "save_to_disk": "/raid/yoach/tmp/artefacts/libritts_r_1k_hours_processed/",
    "preprocessing_only": false,
    "feature_extractor_name": "ylacombe/dac_44khZ_8kbps",
    "description_tokenizer_name": "google/t5-v1_1-small",
    "prompt_tokenizer_name": "google/t5-v1_1-small",
    "push_to_hub": false,
    "hub_model_id": "ylacombe/stable-speech-75M",
    "report_to": ["wandb"],
    "overwrite_output_dir": false,
    "output_dir": "/raid/yoach/tmp/artefacts/training-75M-0.1/",
    "train_dataset_name": "blabble-io/libritts_r+blabble-io/libritts_r+blabble-io/libritts_r",
    "train_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated+stable-speech/libritts-r-tags-and-text-generated+stable-speech/libritts-r-tags-and-text-generated",
    "train_dataset_config_name": "clean+clean+other",
    "train_split_name": "train.clean.360+train.clean.100+train.other.500",
    "eval_dataset_name": "blabble-io/libritts_r+blabble-io/libritts_r",
    "eval_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated+stable-speech/libritts-r-tags-and-text-generated",
    "eval_dataset_config_name": "clean+other",
    "eval_split_name": "test.clean+test.other",
    "target_audio_column_name": "audio",
    "description_column_name": "text_description",
    "prompt_column_name": "text",
    "max_eval_samples": 24,
    "max_duration_in_seconds": 35,
    "min_duration_in_seconds": 2.0,
    "add_audio_samples_to_wandb": true,
    "id_column_name": "id",
    "preprocessing_num_workers": 16,
    "pad_token_id": 1024,
    "decoder_start_token_id": 1025,
    "do_train": true,
    "num_train_epochs": 1,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": true,
    "per_device_train_batch_size": 28,
    "learning_rate": 1e-4,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "weight_decay": 0.03,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 5000,
    "logging_steps": 102,
    "freeze_text_encoder": true,
    "do_eval": true,
    "predict_with_generate": true,
    "include_inputs_for_metrics": true,
    "evaluation_strategy": "steps",
    "eval_steps": 2500,
    "save_steps": 2499,
    "per_device_eval_batch_size": 1,
    "audio_encode_per_device_eval_batch_size": 24,
    "dtype": "bfloat16",
    "seed": 456,
    "dataloader_num_workers": 16
}
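The example configs in this commit are plain JSON argument files. As a minimal sketch (not part of the commit, and only assuming the repository root as working directory), one can inspect the new 75M config like this; how run_stable_speech_training.py actually parses these files is not visible here, since its diff is collapsed below, so the snippet only reads the raw JSON:

# Minimal sketch (not part of the commit): load the 75M training config and
# print a few of its hyperparameters.
import json

with open("example_configs/librispeech_tts_r_75M.json") as f:
    cfg = json.load(f)

print(cfg["model_name_or_path"])               # untrained 75M checkpoint path
print(cfg["per_device_train_batch_size"])      # 28
print(cfg["dtype"], cfg["lr_scheduler_type"])  # bfloat16 constant_with_warmup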
example_configs/librispeech_tts_r_dummy_dac.json (new file, mode 100644)

{
    "model_name_or_path": "/raid/yoach/tmp/artefacts/tiny-dac-model/",
    "save_to_disk": "/raid/yoach/tmp/artefacts/small_experiment_dataset/",
    "feature_extractor_name": "ylacombe/dac_44khZ_8kbps",
    "description_tokenizer_name": "google-t5/t5-small",
    "prompt_tokenizer_name": "google-t5/t5-small",
    "push_to_hub": false,
    "hub_model_id": "stable-speech-mini",
    "report_to": ["wandb"],
    "overwrite_output_dir": true,
    "output_dir": "/raid/yoach/tmp/artefacts/training/",
    "train_dataset_name": "blabble-io/libritts_r",
    "train_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated",
    "train_dataset_config_name": "clean",
    "train_split_name": "train.clean.360",
    "eval_dataset_name": "blabble-io/libritts_r",
    "eval_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated",
    "eval_dataset_config_name": "clean",
    "eval_split_name": "train.clean.360",
    "target_audio_column_name": "audio",
    "description_column_name": "text_description",
    "prompt_column_name": "text",
    "max_train_samples": 4,
    "max_eval_samples": 4,
    "max_duration_in_seconds": 30,
    "min_duration_in_seconds": 1.0,
    "add_audio_samples_to_wandb": true,
    "id_column_name": "id",
    "preprocessing_num_workers": 1,
    "pad_token_id": 1024,
    "decoder_start_token_id": 1025,
    "do_train": true,
    "num_train_epochs": 180,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": false,
    "per_device_train_batch_size": 2,
    "learning_rate": 1e-3,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "weight_decay": 0.1,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "freeze_text_encoder": true,
    "do_eval": true,
    "predict_with_generate": true,
    "include_inputs_for_metrics": true,
    "evaluation_strategy": "steps",
    "eval_steps": 30,
    "per_device_eval_batch_size": 2,
    "generation_max_length": 800,
    "do_sample": false,
    "logging_steps": 15,
    "dtype": "float32",
    "seed": 456,
    "dataloader_num_workers": 8
}
init_dummy_model_dac.py (new file, mode 100644)

from stable_speech import StableSpeechConfig, StableSpeechForCausalLM, StableSpeechForConditionalGeneration, StableSpeechDecoderConfig
from transformers import AutoConfig, AutoModel
from stable_speech import DACConfig, DACModel

AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)

text_model = "google-t5/t5-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
num_codebooks = 9

t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)

encodec_vocab_size = encodec.codebook_size

decoder_config = StableSpeechDecoderConfig(
    vocab_size=encodec_vocab_size + 1,
    max_position_embeddings=2048,
    num_hidden_layers=4,
    ffn_dim=512,
    num_attention_heads=8,
    layerdrop=0.0,
    use_cache=True,
    activation_function="gelu",
    hidden_size=512,
    dropout=0.0,
    attention_dropout=0.0,
    activation_dropout=0.0,
    pad_token_id=encodec_vocab_size,
    eos_token_id=encodec_vocab_size,
    bos_token_id=encodec_vocab_size + 1,
    num_codebooks=num_codebooks,
)

# TODO: ?? how to make it stop ?
decoder = StableSpeechForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")

model = StableSpeechForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size

# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False  # True
model.generation_config.guidance_scale = 1  # 3.0

model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-dac-model/")
\ No newline at end of file
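A quick sanity check one might run after this script (not part of the commit; the checkpoint path, the registration calls, and the generation_config fields are taken from the code above, everything else is an assumption):

# Hypothetical follow-up, not in the commit: reload the tiny DAC-based model
# and confirm the generation defaults were saved as expected.
from transformers import AutoConfig, AutoModel
from stable_speech import DACConfig, DACModel, StableSpeechForConditionalGeneration

AutoConfig.register("dac", DACConfig)   # same registration as in the script
AutoModel.register(DACConfig, DACModel)

model = StableSpeechForConditionalGeneration.from_pretrained(
    "/raid/yoach/tmp/artefacts/tiny-dac-model/"
)
print(model.generation_config.max_length)      # 30 s worth of codec frames
print(model.generation_config.pad_token_id,
      model.generation_config.eos_token_id)    # both equal the codebook size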
init_model.py

@@ -3,9 +3,15 @@ from transformers import T5Config, EncodecConfig
 from transformers import AutoConfig
+from transformers import AutoConfig, AutoModel
+from stable_speech import DACConfig, DACModel
+
+AutoConfig.register("dac", DACConfig)
+AutoModel.register(DACConfig, DACModel)
+
 text_model = "google-t5/t5-small"
-encodec_version = "facebook/encodec_24khz"
-num_codebooks = 8
+encodec_version = "ylacombe/dac_44khZ_8kbps"
+num_codebooks = 9

 t5 = AutoConfig.from_pretrained(text_model)

@@ -16,7 +22,7 @@ encodec_vocab_size = encodec.codebook_size
 decoder_config = StableSpeechDecoderConfig(
     vocab_size=encodec_vocab_size + 1,
-    max_position_embeddings=2250,  # 30 s
+    max_position_embeddings=3000,  # 30 s = 2580
     num_hidden_layers=12,
     ffn_dim=4096,
     num_attention_heads=16,
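The "# 30 s = 2580" figure in the new hunk follows from the codec frame rate used elsewhere in this commit (max_length = int(30 * frame_rate)). A rough check, where the 86 frames/s value is an assumption about ylacombe/dac_44khZ_8kbps rather than something stated in the diff:

# Rough check of the "# 30 s = 2580" comment above.
frame_rate = 86          # assumed: DAC at 44.1 kHz with a 512-sample hop is ~86 frames/s
print(30 * frame_rate)   # 2580 codec positions for 30 s of audio
# max_position_embeddings is therefore bumped from 2250 to 3000 for headroom.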
init_model_75M.py (new file, mode 100644)

from stable_speech import StableSpeechConfig, StableSpeechForCausalLM, StableSpeechForConditionalGeneration, StableSpeechDecoderConfig
from transformers import T5Config, EncodecConfig
from transformers import AutoConfig, AutoModel
from stable_speech import DACConfig, DACModel

AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)

text_model = "google/t5-v1_1-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
num_codebooks = 9

t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)

encodec_vocab_size = encodec.codebook_size

decoder_config = StableSpeechDecoderConfig(
    vocab_size=encodec_vocab_size + 1,
    max_position_embeddings=4096,  # 30 s = 2580
    num_hidden_layers=8,
    ffn_dim=3072,
    num_attention_heads=12,
    layerdrop=0.0,
    use_cache=True,
    activation_function="gelu",
    hidden_size=768,
    dropout=0.0,
    attention_dropout=0.0,
    activation_dropout=0.0,
    pad_token_id=encodec_vocab_size,
    eos_token_id=encodec_vocab_size,
    bos_token_id=encodec_vocab_size + 1,
    num_codebooks=num_codebooks,
)

decoder = StableSpeechForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder_small/")

model = StableSpeechForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder_small/",
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size

# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False  # True
model.generation_config.guidance_scale = 1  # 3.0

model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/")
\ No newline at end of file
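A hypothetical size check for this configuration, presumably the "75M" referred to in the checkpoint name (not part of the commit; the .decoder attribute name follows the MusicGen-style composite model and is an assumption here):

# Hypothetical size check, not in the commit: reload the saved checkpoint and
# count decoder parameters.
from transformers import AutoConfig, AutoModel
from stable_speech import DACConfig, DACModel, StableSpeechForConditionalGeneration

AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)

model = StableSpeechForConditionalGeneration.from_pretrained(
    "/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/"
)
n_params = sum(p.numel() for p in model.decoder.parameters())  # decoder only
print(f"decoder parameters: {n_params / 1e6:.1f}M")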
run_stable_speech_training.py

This diff is collapsed on the original page (+206 −186) and is not reproduced here.