Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
parler-tts
Commits
5564a484
Commit
5564a484
authored
Apr 08, 2024
by
Yoach Lacombe
Browse files
finalize init model scripts
parent
7ea2b865
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
154 deletions
+177
-154
scripts/model_init/init_dummy_model.py
scripts/model_init/init_dummy_model.py
+67
-0
scripts/model_init/init_dummy_model_with_dac.py
scripts/model_init/init_dummy_model_with_dac.py
+0
-57
scripts/model_init/init_dummy_model_with_encodec.py
scripts/model_init/init_dummy_model_with_encodec.py
+59
-55
scripts/model_init/init_model_300M.py
scripts/model_init/init_model_300M.py
+51
-42
No files found.
scripts/model_init/init_dummy_model.py
0 → 100644
View file @
5564a484
"""Initialize and save a tiny (dummy-sized) ParlerTTS model for testing.

Builds a small ParlerTTS decoder from scratch, saves it, then assembles the
full conditional-generation model from a text encoder, an audio encoder and
the freshly saved decoder, and writes the result to ``<save_directory>/tiny-model``.
"""
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse

if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    cli.add_argument("text_model", type=str, help="Repository id or path to the text encoder.")
    cli.add_argument("audio_model", type=str, help="Repository id or path to the audio encoder.")
    args = cli.parse_args()

    text_model = args.text_model
    encodec_version = args.audio_model

    # Pull the sub-model configs so vocabulary/codebook sizes can be read off them.
    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)

    encodec_vocab_size = encodec.codebook_size
    num_codebooks = encodec.num_codebooks
    print("num_codebooks", num_codebooks)

    # Deliberately tiny decoder (4 layers, hidden 512) — this is a test fixture,
    # not a trainable model. The extra vocab slot (+1) reserves an id for the
    # pad/eos token; bos is set one past that (presumably handled via
    # decoder_start_token_id at generation time — mirrors the other init scripts).
    decoder_config = ParlerTTSDecoderConfig(
        vocab_size=encodec_vocab_size + 1,
        max_position_embeddings=2048,
        num_hidden_layers=4,
        ffn_dim=512,
        num_attention_heads=8,
        layerdrop=0.0,
        use_cache=True,
        activation_function="gelu",
        hidden_size=512,
        dropout=0.0,
        attention_dropout=0.0,
        activation_dropout=0.0,
        pad_token_id=encodec_vocab_size,
        eos_token_id=encodec_vocab_size,
        bos_token_id=encodec_vocab_size + 1,
        num_codebooks=num_codebooks,
    )

    decoder_dir = os.path.join(args.save_directory, "decoder")
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(decoder_dir)

    # Stitch text encoder + audio encoder + saved decoder into one model.
    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=decoder_dir,
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params: cap generation at ~30 s of audio
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True  # True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory, "tiny-model"))
scripts/model_init/init_dummy_model_with_dac.py
deleted
100644 → 0
View file @
7ea2b865
"""Initialize a tiny ParlerTTS model wired to a DAC audio codec.

Hardcoded variant of the dummy-model init: T5-small text encoder,
``ylacombe/dac_44khZ_8kbps`` audio codec, 9 codebooks. Artifacts are
written under ``./tmp/artefacts/``.
"""
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os

TMP_DIR = "./tmp/artefacts/"

text_model = "google-t5/t5-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
# NOTE(review): hardcoded rather than read from the codec config — assumed to
# match the DAC checkpoint above; verify if the checkpoint changes.
num_codebooks = 9

# Sub-model configs provide the vocab/codebook sizes used below.
t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)
encodec_vocab_size = encodec.codebook_size

# Tiny 4-layer decoder for testing; +1 vocab slot reserves the pad/eos id.
decoder_config = ParlerTTSDecoderConfig(
    vocab_size=encodec_vocab_size + 1,
    max_position_embeddings=2048,
    num_hidden_layers=4,
    ffn_dim=512,
    num_attention_heads=8,
    layerdrop=0.0,
    use_cache=True,
    activation_function="gelu",
    hidden_size=512,
    dropout=0.0,
    attention_dropout=0.0,
    activation_dropout=0.0,
    pad_token_id=encodec_vocab_size,
    eos_token_id=encodec_vocab_size,
    bos_token_id=encodec_vocab_size + 1,
    num_codebooks=num_codebooks,
)

decoder_path = os.path.join(TMP_DIR, "decoder")
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained(decoder_path)

# Assemble the full encoder-decoder model from the three sub-models.
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path=decoder_path,
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size

# set other default generation config params: ~30 s of audio at the codec frame rate
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = True  # True
model.generation_config.guidance_scale = 1  # 3.0

model.save_pretrained(os.path.join(TMP_DIR, "tiny-model"))
scripts/model_init/init_dummy_model_with_encodec.py
View file @
5564a484
"""Initialize a tiny ParlerTTS model wired to the EnCodec audio codec.

Reconstructed from an interleaved side-by-side diff: this is the post-commit
version, which takes only ``save_directory`` on the command line and keeps
the T5-small / ``facebook/encodec_24khz`` model ids hardcoded.
"""
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse

if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    args = cli.parse_args()

    text_model = "google-t5/t5-small"
    encodec_version = "facebook/encodec_24khz"
    # NOTE(review): hardcoded, not read from the EnCodec config — assumed to
    # match the 24 kHz checkpoint; confirm if the checkpoint changes.
    num_codebooks = 8

    # Sub-model configs provide the vocab/codebook sizes used below.
    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)
    encodec_vocab_size = encodec.codebook_size
    print("num_codebooks", num_codebooks)

    # Tiny 4-layer decoder for testing; +1 vocab slot reserves the pad/eos id.
    decoder_config = ParlerTTSDecoderConfig(
        vocab_size=encodec_vocab_size + 1,
        max_position_embeddings=2048,
        num_hidden_layers=4,
        ffn_dim=512,
        num_attention_heads=8,
        layerdrop=0.0,
        use_cache=True,
        activation_function="gelu",
        hidden_size=512,
        dropout=0.0,
        attention_dropout=0.0,
        activation_dropout=0.0,
        pad_token_id=encodec_vocab_size,
        eos_token_id=encodec_vocab_size,
        bos_token_id=encodec_vocab_size + 1,
        num_codebooks=num_codebooks,
    )

    decoder_dir = os.path.join(args.save_directory, "decoder")
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(decoder_dir)

    # Assemble the full encoder-decoder model from the three sub-models.
    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=decoder_dir,
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params: ~30 s of audio at the codec frame rate
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True  # True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory, "tiny-model"))
scripts/model_init/init_model_300M.py
View file @
5564a484
"""Initialize and save an untrained ~300M-parameter ParlerTTS model.

Reconstructed from an interleaved side-by-side diff: this is the post-commit
version, which takes the save directory and the text/audio model ids on the
command line. The decoder is a 24-layer, hidden-size-1024 transformer; the
assembled model is written to ``<save_directory>/stable-speech-untrained-300M/``.
"""
from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig
import os
import argparse

if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("save_directory", type=str, help="Directory where to save the model and the decoder.")
    cli.add_argument("text_model", type=str, help="Repository id or path to the text encoder.")
    cli.add_argument("audio_model", type=str, help="Repository id or path to the audio encoder.")
    args = cli.parse_args()

    text_model = args.text_model
    encodec_version = args.audio_model

    # Sub-model configs provide the vocab/codebook sizes used below.
    t5 = AutoConfig.from_pretrained(text_model)
    encodec = AutoConfig.from_pretrained(encodec_version)
    encodec_vocab_size = encodec.codebook_size
    num_codebooks = encodec.num_codebooks
    print("num_codebooks", num_codebooks)

    decoder_config = ParlerTTSDecoderConfig(
        vocab_size=encodec_vocab_size + 64,  # + 64 instead of +1 to have a multiple of 64
        max_position_embeddings=4096,  # 30 s = 2580
        num_hidden_layers=24,
        ffn_dim=4096,
        num_attention_heads=16,
        layerdrop=0.0,
        use_cache=True,
        activation_function="gelu",
        hidden_size=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        pad_token_id=encodec_vocab_size,
        eos_token_id=encodec_vocab_size,
        bos_token_id=encodec_vocab_size + 1,
        num_codebooks=num_codebooks,
    )

    decoder_dir = os.path.join(args.save_directory, "decoder")
    decoder = ParlerTTSForCausalLM(decoder_config)
    decoder.save_pretrained(decoder_dir)

    # Assemble the full encoder-decoder model from the three sub-models.
    model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
        text_encoder_pretrained_model_name_or_path=text_model,
        audio_encoder_pretrained_model_name_or_path=encodec_version,
        decoder_pretrained_model_name_or_path=decoder_dir,
        vocab_size=t5.vocab_size,
    )

    # set the appropriate bos/pad token ids
    model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
    model.generation_config.pad_token_id = encodec_vocab_size
    model.generation_config.eos_token_id = encodec_vocab_size

    # set other default generation config params: ~30 s of audio at the codec frame rate
    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
    model.generation_config.do_sample = True  # True
    model.generation_config.guidance_scale = 1  # 3.0

    model.save_pretrained(os.path.join(args.save_directory, "stable-speech-untrained-300M/"))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment