chenpangpang / parler-tts · Commits

Commit 71d87fd3
Authored Feb 28, 2024 by Yoach Lacombe
Parent: eec6e3d6

    add latest changes

Showing 5 changed files with 83 additions and 47 deletions (+83 / -47):

    example_configs/librispeech_tts_r_100.json    +20  -16
    example_configs/librispeech_tts_r_dummy.json   +9   -9
    init_dummy_model.py                           +23   -9
    init_model.py                                 +30  -12
    run_stable_speech_training.py                  +1   -1
example_configs/librispeech_tts_r_100.json

 {
-    "model_name_or_path": "/raid/yoach/tmp/small-stable-speech-untrained/",
+    "model_name_or_path": "/raid/yoach/tmp/artefacts/small-stable-speech-untrained/",
-    "feature_extractor_name": "facebook/encodec_32khz",
+    "feature_extractor_name": "facebook/encodec_24khz",
-    "description_tokenizer_name": "t5-base",
+    "description_tokenizer_name": "google-t5/t5-small",
-    "prompt_tokenizer_name": "t5-base",
+    "prompt_tokenizer_name": "google-t5/t5-small",
-    "push_to_hub": false,
+    "push_to_hub": true,
-    "hub_model_id": "stable-speech-mini",
+    "hub_model_id": "ylacombe/stable-speech-mini",
     "report_to": ["wandb"],
     "overwrite_output_dir": true,
-    "output_dir": "/raid/yoach/tmp/artefacts/training/",
+    "output_dir": "/raid/yoach/tmp/artefacts/training-mini/",
     "train_dataset_name": "blabble-io/libritts_r",
     ...

@@ -28,22 +28,24 @@
     "max_eval_samples": 24,
-    "max_duration_in_seconds": 32,
+    "max_duration_in_seconds": 29,
     "min_duration_in_seconds": 2.0,
     "add_audio_samples_to_wandb": true,
     "id_column_name": "id",
-    "preprocessing_num_workers": 8,
+    "preprocessing_num_workers": 1,
-    "pad_token_id": 1024,
-    "decoder_start_token_id": 1025,
+    "pad_token_id": 2050,
+    "decoder_start_token_id": 2048,
     "do_train": true,
-    "num_train_epochs": 200,
+    "num_train_epochs": 15,
     "gradient_accumulation_steps": 1,
-    "gradient_checkpointing": false,
+    "gradient_checkpointing": true,
-    "per_device_train_batch_size": 50,
+    "per_device_train_batch_size": 40,
     "learning_rate": 1e-4,
     "adam_beta1": 0.9,
     "adam_beta2": 0.999,
     ...

@@ -61,11 +63,13 @@
     "predict_with_generate": true,
     "include_inputs_for_metrics": true,
     "evaluation_strategy": "steps",
-    "eval_steps": 600,
+    "eval_steps": 3000,
+    "save_steps": 3000,
     "per_device_eval_batch_size": 8,
     "generation_max_length": 400,
-    "audio_encode_per_device_eval_batch_size": 24,
+    "audio_encode_per_device_eval_batch_size": 32,
     "dtype": "float16",
     "seed": 456,
     ...
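These example configs are passed to run_stable_speech_training.py as a single JSON file. Below is a minimal sketch of how such a file is usually consumed; the HfArgumentParser pattern is an assumption about the script, only ModelArguments is visible in this commit, and DataTrainingArguments is a hypothetical name.

# Sketch only: parse the example config into the script's argument dataclasses.
# Assumes the usual transformers pattern; DataTrainingArguments is a hypothetical name.
from transformers import HfArgumentParser, Seq2SeqTrainingArguments
from run_stable_speech_training import ModelArguments, DataTrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
model_args, data_args, training_args = parser.parse_json_file(
    json_file="example_configs/librispeech_tts_r_100.json"
)
print(training_args.output_dir)  # "/raid/yoach/tmp/artefacts/training-mini/" after this commit
print(model_args.bandwidth)      # 6 after this commit (see run_stable_speech_training.py below)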
example_configs/librispeech_tts_r_dummy.json

 {
-    "model_name_or_path": "/home/yoach/dataspeech/artefacts/tiny-model/",
+    "model_name_or_path": "/raid/yoach/tmp/artefacts/tiny-model/",
-    "feature_extractor_name": "facebook/encodec_32khz",
+    "feature_extractor_name": "facebook/encodec_24khz",
-    "description_tokenizer_name": "t5-base",
+    "description_tokenizer_name": "google-t5/t5-small",
-    "prompt_tokenizer_name": "t5-base",
+    "prompt_tokenizer_name": "google-t5/t5-small",
     "push_to_hub": false,
     "hub_model_id": "stable-speech-mini",
     "report_to": ["wandb"],
     "overwrite_output_dir": true,
-    "output_dir": "/home/yoach/dataspeech/artefacts/training/",
+    "output_dir": "/raid/yoach/tmp/artefacts/training/",
     "train_dataset_name": "blabble-io/libritts_r",
     "train_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated",
     ...

@@ -36,11 +36,11 @@
     "preprocessing_num_workers": 1,
-    "pad_token_id": 2048,
+    "pad_token_id": 1024,
-    "decoder_start_token_id": 2049,
+    "decoder_start_token_id": 1025,
     "do_train": true,
-    "num_train_epochs": 120,
+    "num_train_epochs": 180,
     "gradient_accumulation_steps": 1,
     "gradient_checkpointing": false,
     "per_device_train_batch_size": 2,
     ...

@@ -62,7 +62,7 @@
     "evaluation_strategy": "steps",
     "eval_steps": 30,
     "per_device_eval_batch_size": 2,
-    "generation_max_length": 400,
+    "generation_max_length": 800,
     "do_sample": false,
     "logging_steps": 15,
     ...
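One way to read the generation_max_length change: assuming the limit is counted in EnCodec 24 kHz codec frames (75 per second), 400 tokens caps generated audio at roughly 5.3 s and 800 at roughly 10.7 s. A back-of-the-envelope check:

# Rough duration implied by generation_max_length,
# assuming it is measured in 75 Hz EnCodec 24 kHz frames (an assumption, not stated in the config).
ENCODEC_24KHZ_FRAME_RATE = 75
for max_len in (400, 800):
    print(f"{max_len} tokens ~ {max_len / ENCODEC_24KHZ_FRAME_RATE:.1f} s")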
init_dummy_model.py

@@ -2,7 +2,18 @@ from stable_speech import StableSpeechConfig, StableSpeechForCausalLM, StableSpe
 from transformers import T5Config, EncodecConfig
 from transformers import AutoConfig
+text_model = "google-t5/t5-small"
+encodec_version = "facebook/encodec_24khz"
+num_codebooks = 8
+t5 = AutoConfig.from_pretrained(text_model)
+encodec = AutoConfig.from_pretrained(encodec_version)
+encodec_vocab_size = encodec.codebook_size
 decoder_config = StableSpeechDecoderConfig(
+    vocab_size=encodec_vocab_size + 1,
     max_position_embeddings=2048,
     num_hidden_layers=4,
     ffn_dim=512,

@@ -14,6 +25,10 @@ decoder_config = StableSpeechDecoderConfig(
     dropout=0.0,
     attention_dropout=0.0,
     activation_dropout=0.0,
+    pad_token_id=encodec_vocab_size,
+    eos_token_id=encodec_vocab_size,
+    bos_token_id=encodec_vocab_size + 1,
+    num_codebooks=num_codebooks,
 )
 # TODO: ?? how to make it stop ?

@@ -21,26 +36,25 @@ decoder_config = StableSpeechDecoderConfig(
 decoder = StableSpeechForCausalLM(decoder_config)
-decoder.save_pretrained("/home/yoach/dataspeech/artefacts/decoder/")
+decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
-t5 = AutoConfig.from_pretrained("t5-base")
 model = StableSpeechForConditionalGeneration.from_sub_models_pretrained(
-    text_encoder_pretrained_model_name_or_path="t5-base",
+    text_encoder_pretrained_model_name_or_path=text_model,
-    audio_encoder_pretrained_model_name_or_path="facebook/encodec_32khz",
+    audio_encoder_pretrained_model_name_or_path=encodec_version,
-    decoder_pretrained_model_name_or_path="/home/yoach/dataspeech/artefacts/decoder/",
+    decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
     vocab_size=t5.vocab_size
 )
 # set the appropriate bos/pad token ids
-model.generation_config.decoder_start_token_id = 2049
+model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
-model.generation_config.pad_token_id = 2048
+model.generation_config.pad_token_id = encodec_vocab_size
-model.generation_config.eos_token_id = 2048
+model.generation_config.eos_token_id = encodec_vocab_size
 # set other default generation config params
 model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
 model.generation_config.do_sample = False # True
 model.generation_config.guidance_scale = 1 # 3.0
-model.save_pretrained("/home/yoach/dataspeech/artefacts/tiny-model/")
\ No newline at end of file
+model.save_pretrained("/raid/yoach/tmp/artefacts/tiny-model/")
\ No newline at end of file
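For reference, the special-token arithmetic the rewritten script derives from the codec config works out as follows: EnCodec 24 kHz codebooks have 1024 entries, so the decoder vocabulary spans 1025 ids, with id 1024 doubling as pad/eos and 1025 used as bos/decoder start. These are the same pad_token_id/decoder_start_token_id values the updated librispeech_tts_r_dummy.json now carries. A quick check (a sketch; it fetches the EnCodec config from the Hub):

from transformers import AutoConfig

encodec = AutoConfig.from_pretrained("facebook/encodec_24khz")
encodec_vocab_size = encodec.codebook_size  # 1024 entries per codebook
print(encodec_vocab_size)                   # 1024 -> pad_token_id / eos_token_id
print(encodec_vocab_size + 1)               # 1025 -> bos_token_id / decoder_start_token_id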
init_model.py

@@ -2,9 +2,22 @@ from stable_speech import StableSpeechConfig, StableSpeechForCausalLM, StableSpe
 from transformers import T5Config, EncodecConfig
 from transformers import AutoConfig
+text_model = "google-t5/t5-small"
+encodec_version = "facebook/encodec_24khz"
+num_codebooks = 8
+t5 = AutoConfig.from_pretrained(text_model)
+encodec = AutoConfig.from_pretrained(encodec_version)
+encodec_vocab_size = encodec.codebook_size
 decoder_config = StableSpeechDecoderConfig(
-    max_position_embeddings=2048,
-    num_hidden_layers=24,
+    vocab_size=encodec_vocab_size + 1,
+    max_position_embeddings=2250, # 30 s
+    num_hidden_layers=12,
     ffn_dim=4096,
     num_attention_heads=16,
     layerdrop=0.0,

@@ -14,30 +27,35 @@ decoder_config = StableSpeechDecoderConfig(
     dropout=0.0,
     attention_dropout=0.0,
     activation_dropout=0.0,
-)
+    pad_token_id=encodec_vocab_size,
+    eos_token_id=encodec_vocab_size,
+    bos_token_id=encodec_vocab_size + 1,
+    num_codebooks=num_codebooks,
+)
 decoder = StableSpeechForCausalLM(decoder_config)
-decoder.save_pretrained("/home/yoach/dataspeech/artefacts/decoder/")
+decoder.save_pretrained("/raid/yoach/tmp/artefacts/decoder/")
-t5 = AutoConfig.from_pretrained("t5-base")
 model = StableSpeechForConditionalGeneration.from_sub_models_pretrained(
-    text_encoder_pretrained_model_name_or_path="t5-base",
+    text_encoder_pretrained_model_name_or_path=text_model,
-    audio_encoder_pretrained_model_name_or_path="facebook/encodec_32khz",
+    audio_encoder_pretrained_model_name_or_path=encodec_version,
-    decoder_pretrained_model_name_or_path="/home/yoach/dataspeech/artefacts/decoder/",
+    decoder_pretrained_model_name_or_path="/raid/yoach/tmp/artefacts/decoder/",
     vocab_size=t5.vocab_size
 )
 # set the appropriate bos/pad token ids
-model.generation_config.decoder_start_token_id = 2049
+model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
-model.generation_config.pad_token_id = 2048
+model.generation_config.pad_token_id = encodec_vocab_size
-model.generation_config.eos_token_id = 2048
+model.generation_config.eos_token_id = encodec_vocab_size
 # set other default generation config params
 model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
 model.generation_config.do_sample = False # True
 model.generation_config.guidance_scale = 1 # 3.0
-model.save_pretrained("/home/yoach/dataspeech/artefacts/small-stable-speech-untrained/")
\ No newline at end of file
+model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/")
\ No newline at end of file
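The new max_position_embeddings=2250 lines up with the generation budget the script already sets: EnCodec 24 kHz encodes audio at 75 frames per second, so 30 s of audio is 2250 decoder positions, the same number produced by int(30 * model.audio_encoder.config.frame_rate). A small sanity check of that arithmetic:

from transformers import AutoConfig

frame_rate = AutoConfig.from_pretrained("facebook/encodec_24khz").frame_rate  # 75 frames/s
print(int(30 * frame_rate))  # 2250, i.e. max_position_embeddings covers 30 s of codec frames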
run_stable_speech_training.py

@@ -257,7 +257,7 @@ class ModelArguments:
         metadata={"help": "Whether to do sampling or greedy decoding."},
     )
     bandwidth: float = field(
-        default=3, # TODO
+        default=6, # TODO
         metadata={"help": "Audio encoder bandwidth."},
     )
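Raising the default bandwidth from 3 to 6 (kbps) is consistent with the num_codebooks = 8 used in the init scripts: at 6 kbps, EnCodec 24 kHz quantizes each 75 Hz frame with 8 codebooks of 1024 entries (10 bits each), i.e. 8 x 10 x 75 = 6000 bits/s, while 3 kbps keeps only 4 codebooks. A back-of-the-envelope check:

# EnCodec 24 kHz: bitrate implied by the number of retained codebooks
FRAME_RATE = 75          # frames per second
BITS_PER_CODEBOOK = 10   # log2(1024 codebook entries)
for n_codebooks in (2, 4, 8, 16, 32):
    kbps = n_codebooks * BITS_PER_CODEBOOK * FRAME_RATE / 1000
    print(n_codebooks, "codebooks ->", kbps, "kbps")  # 8 codebooks -> 6.0 kbps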