chenpangpang / parler-tts · Commits
Commit ee12a812 authored Feb 27, 2024 by Yoach Lacombe

enrich config and code

parent 7ae1e8e3
Showing 3 changed files with 127 additions and 2 deletions (+127 -2)
example_configs/librispeech_tts_r_100.json  +73 -0
init_model.py  +43 -0
run_stable_speech_training.py  +11 -2
example_configs/librispeech_tts_r_100.json (new file, 0 → 100644)
{
    "model_name_or_path": "/raid/yoach/tmp/small-stable-speech-untrained/",
    "feature_extractor_name": "facebook/encodec_32khz",
    "description_tokenizer_name": "t5-base",
    "prompt_tokenizer_name": "t5-base",
    "push_to_hub": false,
    "hub_model_id": "stable-speech-mini",
    "report_to": ["wandb"],
    "overwrite_output_dir": true,
    "output_dir": "/raid/yoach/tmp/artefacts/training/",
    "train_dataset_name": "blabble-io/libritts_r",
    "train_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated",
    "train_dataset_config_name": "clean",
    "train_split_name": "train.clean.100",
    "eval_dataset_name": "blabble-io/libritts_r",
    "eval_metadata_dataset_name": "stable-speech/libritts-r-tags-and-text-generated",
    "eval_dataset_config_name": "clean",
    "eval_split_name": "test.clean",
    "target_audio_column_name": "audio",
    "description_column_name": "text_description",
    "prompt_column_name": "text",
    "max_eval_samples": 24,
    "max_duration_in_seconds": 32,
    "min_duration_in_seconds": 2.0,
    "add_audio_samples_to_wandb": true,
    "id_column_name": "id",
    "preprocessing_num_workers": 8,
    "pad_token_id": 2050,
    "decoder_start_token_id": 2048,
    "do_train": true,
    "num_train_epochs": 200,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": false,
    "per_device_train_batch_size": 50,
    "learning_rate": 1e-4,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "weight_decay": 0.1,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "logging_steps": 102,
    "freeze_text_encoder": false,
    "do_eval": true,
    "predict_with_generate": true,
    "include_inputs_for_metrics": true,
    "evaluation_strategy": "steps",
    "eval_steps": 600,
    "per_device_eval_batch_size": 8,
    "generation_max_length": 400,
    "audio_encode_per_device_eval_batch_size": 24,
    "dtype": "float16",
    "seed": 456,
    "dataloader_num_workers": 16
}
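Note (not part of the commit): the config above is plain JSON, so it can be inspected or adjusted programmatically before a run. HF-style example scripts conventionally accept a single .json file as their only CLI argument and parse it with HfArgumentParser; assuming run_stable_speech_training.py follows that convention, a minimal sketch for reusing this config with a different batch size could look like this (the file path matches the committed config, everything else is illustrative):

# Sketch only: load, inspect, and tweak the example config before launching training.
import json

with open("example_configs/librispeech_tts_r_100.json") as f:
    cfg = json.load(f)

print(cfg["train_dataset_name"], cfg["train_split_name"])  # blabble-io/libritts_r train.clean.100

cfg["per_device_train_batch_size"] = 8  # e.g. shrink the batch for a smaller GPU (illustrative value)
with open("librispeech_tts_r_100_small_batch.json", "w") as f:
    json.dump(cfg, f, indent=4)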
init_model.py (new file, 0 → 100644)
from stable_speech import StableSpeechConfig, StableSpeechForCausalLM, StableSpeechForConditionalGeneration, StableSpeechDecoderConfig
from transformers import T5Config, EncodecConfig
from transformers import AutoConfig

decoder_config = StableSpeechDecoderConfig(
    max_position_embeddings=2048,
    num_hidden_layers=24,
    ffn_dim=4096,
    num_attention_heads=16,
    layerdrop=0.0,
    use_cache=True,
    activation_function="gelu",
    hidden_size=1024,
    dropout=0.0,
    attention_dropout=0.0,
    activation_dropout=0.0,
)

decoder = StableSpeechForCausalLM(decoder_config)
decoder.save_pretrained("/raid/yoach/tmp/decoder/")

t5 = AutoConfig.from_pretrained("t5-base")

model = StableSpeechForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path="t5-base",
    audio_encoder_pretrained_model_name_or_path="facebook/encodec_32khz",
    decoder_pretrained_model_name_or_path="/raid/yoach/tmp/decoder/",
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2050
model.generation_config.eos_token_id = 2049

# set other default generation config params
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = False  # True
model.generation_config.guidance_scale = 1  # 3.0

model.save_pretrained("/raid/yoach/tmp/small-stable-speech-untrained/")
\ No newline at end of file
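Note (assumption, not part of the commit): since the composite model is written out with save_pretrained, it should reload through the standard transformers from_pretrained API, which is how the "model_name_or_path" entry in the example config above would pick it up. A minimal sketch:

# Sketch only: reload the untrained checkpoint saved by init_model.py and check
# the generation defaults it sets (2048 / 2050 / 2049 per the script above).
from stable_speech import StableSpeechForConditionalGeneration

model = StableSpeechForConditionalGeneration.from_pretrained(
    "/raid/yoach/tmp/small-stable-speech-untrained/"
)
print(model.generation_config.decoder_start_token_id)  # 2048
print(model.generation_config.pad_token_id)            # 2050
print(model.generation_config.eos_token_id)            # 2049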
run_stable_speech_training.py
@@ -442,6 +442,14 @@ class StableSpeechTrainingArguments(Seq2SeqTrainingArguments):
             )
         },
     )
+    audio_encode_per_device_eval_batch_size: int = field(
+        default=8,
+        metadata={
+            "help": (
+                "TODO"
+            )
+        },
+    )
 
 @dataclass
 class DataCollatorEncodecWithPadding:
@@ -965,7 +973,8 @@ def main():
     def apply_audio_decoder(batch):
         len_audio = batch.pop("len_audio")
         audio_decoder.to(batch["input_values"].device).eval()
-        labels = audio_decoder.encode(**batch)["audio_codes"]
+        with torch.no_grad():
+            labels = audio_decoder.encode(**batch)["audio_codes"]
         output = {}
         output["len_audio"] = len_audio
         # (1, bsz, codebooks, seq_len) -> (bsz, seq_len, codebooks)
@@ -976,7 +985,7 @@ def main():
     for split in vectorized_datasets:
         data_loader = DataLoader(
             vectorized_datasets[split],
-            batch_size=training_args.per_device_eval_batch_size,
+            batch_size=training_args.audio_encode_per_device_eval_batch_size,
             collate_fn=encoder_data_collator,
             num_workers=training_args.dataloader_num_workers,
             pin_memory=True,
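Note (illustration, not from the diff): the comment in the second hunk describes reshaping the EnCodec output from (1, bsz, codebooks, seq_len) to (bsz, seq_len, codebooks). A minimal standalone sketch of that transpose, with hypothetical shapes and dummy values:

# Illustration only: dummy audio codes with 4 samples, 9 codebooks, 100 frames.
import torch

audio_codes = torch.randint(0, 2048, (1, 4, 9, 100))  # (1, bsz, codebooks, seq_len)
labels = audio_codes.squeeze(0).transpose(1, 2)        # -> (bsz, seq_len, codebooks)
print(labels.shape)                                    # torch.Size([4, 100, 9])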