Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
parler-tts
Commits
5bfd88ee
Commit
5bfd88ee
authored
Apr 08, 2024
by
Yoach Lacombe
Browse files
add init script
parent
7f3ed1f7
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
74 additions
and
23 deletions
+74
-23
scripts/model_init/init_dummy_model_with_dac.py
scripts/model_init/init_dummy_model_with_dac.py
+7
-15
scripts/model_init/init_dummy_model_with_encodec.py
scripts/model_init/init_dummy_model_with_encodec.py
+8
-8
scripts/model_init/init_model_300M.py
scripts/model_init/init_model_300M.py
+59
-0
No files found.
init_dummy_model_dac.py
→
scripts/model_init/
init_dummy_model_
with_
dac.py
View file @
5bfd88ee
from
parler_tts
import
ParlerTTSConfig
,
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
ParlerTTSDecoderConfig
from
parler_tts
import
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
ParlerTTSDecoderConfig
from
transformers
import
AutoConfig
from
transformers
import
AutoModel
from
transformers
import
AutoConfig
,
AutoModel
from
parler_tts
import
DACConfig
,
DACModel
AutoConfig
.
register
(
"dac"
,
DACConfig
)
AutoModel
.
register
(
DACConfig
,
DACModel
)
import
os
TMP_DIR
=
"./tmp/artefacts/"
text_model
=
"google-t5/t5-small"
encodec_version
=
"ylacombe/dac_44khZ_8kbps"
...
...
@@ -37,18 +31,16 @@ decoder_config = ParlerTTSDecoderConfig(
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
)
# TODO: ?? how to make it stop ?
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
decoder
.
save_pretrained
(
os
.
path
.
join
(
TMP_DIR
,
"decoder"
))
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/
decoder
/
"
,
decoder_pretrained_model_name_or_path
=
os
.
path
.
join
(
TMP_DIR
,
"
decoder"
)
,
vocab_size
=
t5
.
vocab_size
,
)
...
...
@@ -59,7 +51,7 @@ model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
Fals
e
# True
model
.
generation_config
.
do_sample
=
Tru
e
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/
tiny-
dac-
model
/
"
)
model
.
save_pretrained
(
os
.
path
.
join
(
TMP_DIR
,
"
tiny-model"
)
)
init_dummy_model.py
→
scripts/model_init/
init_dummy_model
_with_encodec
.py
View file @
5bfd88ee
from
parler_tts
import
ParlerTTSConfig
,
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
ParlerTTSDecoderConfig
from
transformers
import
T5Config
,
EncodecConfig
from
parler_tts
import
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
ParlerTTSDecoderConfig
from
transformers
import
AutoConfig
import
os
TMP_DIR
=
"./artefacts/"
text_model
=
"google-t5/t5-small"
encodec_version
=
"facebook/encodec_24khz"
...
...
@@ -30,18 +32,16 @@ decoder_config = ParlerTTSDecoderConfig(
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
)
# TODO: ?? how to make it stop ?
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/
decoder
/
"
)
decoder
.
save_pretrained
(
os
.
path
.
join
(
TMP_DIR
,
"
decoder"
)
)
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/
decoder
/
"
,
decoder_pretrained_model_name_or_path
=
os
.
path
.
join
(
TMP_DIR
,
"
decoder"
)
,
vocab_size
=
t5
.
vocab_size
,
)
...
...
@@ -52,7 +52,7 @@ model.generation_config.eos_token_id = encodec_vocab_size
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
Fals
e
# True
model
.
generation_config
.
do_sample
=
Tru
e
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/
tiny-model
/
"
)
model
.
save_pretrained
(
os
.
path
.
join
(
TMP_DIR
,
"
tiny-model"
)
)
scripts/model_init/init_model_300M.py
0 → 100644
View file @
5bfd88ee
"""Build an untrained ~300M-parameter Parler-TTS model and save it to disk.

Instantiates a fresh decoder from a hand-written config, then assembles the
full text-encoder / audio-encoder / decoder model and writes the result
under TMP_DIR.
"""
import os

from parler_tts import ParlerTTSForCausalLM, ParlerTTSForConditionalGeneration, ParlerTTSDecoderConfig
from transformers import AutoConfig

# All intermediate and final checkpoints are written below this directory.
TMP_DIR = "./tmp/artefacts/"

# Hub ids of the pretrained sub-models used for the text and audio sides.
text_model = "google/flan-t5-base"
encodec_version = "ylacombe/dac_44khZ_8kbps"

num_codebooks = 9

# Only the configs are needed here — the codec vocab size drives the decoder config.
t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)

encodec_vocab_size = encodec.codebook_size

decoder_config = ParlerTTSDecoderConfig(
    # +64 rather than +1 so the embedding table size is a multiple of 64.
    vocab_size=encodec_vocab_size + 64,
    # 30 s of audio corresponds to 2580 positions; 4096 leaves headroom.
    max_position_embeddings=4096,
    num_hidden_layers=24,
    ffn_dim=4096,
    num_attention_heads=16,
    layerdrop=0.0,
    use_cache=True,
    activation_function="gelu",
    hidden_size=1024,
    dropout=0.1,
    attention_dropout=0.0,
    activation_dropout=0.0,
    # Special tokens sit just past the codebook entries.
    pad_token_id=encodec_vocab_size,
    eos_token_id=encodec_vocab_size,
    bos_token_id=encodec_vocab_size + 1,
    num_codebooks=num_codebooks,
)

# Save the randomly-initialised decoder so it can be re-loaded by path below.
decoder = ParlerTTSForCausalLM(decoder_config)
decoder_path = os.path.join(TMP_DIR, "decoder")
decoder.save_pretrained(decoder_path)

# Stitch the three sub-models together into the full conditional-generation model.
model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path=decoder_path,
    vocab_size=t5.vocab_size,
)

# Propagate the decoder's special-token ids to the generation config.
model.generation_config.decoder_start_token_id = encodec_vocab_size + 1
model.generation_config.pad_token_id = encodec_vocab_size
model.generation_config.eos_token_id = encodec_vocab_size

# Default generation settings: cap output at 30 s worth of audio frames,
# sample rather than greedy-decode, guidance_scale of 1 (no extra guidance).
model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 1

model.save_pretrained(os.path.join(TMP_DIR, "stable-speech-untrained-300M/"))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment