Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
parler-tts
Commits
7ea2b865
Commit
7ea2b865
authored
Apr 08, 2024
by
Yoach Lacombe
Browse files
delete some useless files
parent
5bfd88ee
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
130 deletions
+0
-130
init_model.py
init_model.py
+0
-65
init_model_75M.py
init_model_75M.py
+0
-65
No files found.
init_model.py
deleted
100644 → 0
View file @
5bfd88ee
# Initialisation script: builds a Parler-TTS decoder from a fresh config,
# combines it with a pretrained text encoder and a pretrained DAC audio
# encoder, sets default generation parameters, and saves the combined model.
from parler_tts import (
    DACConfig,
    DACModel,
    ParlerTTSConfig,
    ParlerTTSDecoderConfig,
    ParlerTTSForCausalLM,
    ParlerTTSForConditionalGeneration,
)
from transformers import AutoConfig, AutoModel, EncodecConfig, T5Config

# Make the "dac" model type known to the Auto* classes before any
# from_pretrained call is issued against the DAC checkpoint.
AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)

text_model = "google-t5/t5-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
num_codebooks = 9

# Only the configs are loaded here; they supply the vocabulary/codebook sizes.
t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)

encodec_vocab_size = encodec.codebook_size

# Special token ids are placed right after the regular codebook entries:
# pad and eos share one id, bos takes the next one.
_pad_eos_token_id = encodec_vocab_size
_bos_token_id = encodec_vocab_size + 1

# The decoder is written to this directory and read back from it below.
_decoder_dir = "/raid/yoach/tmp/artefacts/decoder/"

_decoder_settings = {
    "vocab_size": encodec_vocab_size + 1,
    "max_position_embeddings": 3000,  # author's note: 30 s = 2580
    "num_hidden_layers": 12,
    "ffn_dim": 4096,
    "num_attention_heads": 16,
    "layerdrop": 0.0,
    "use_cache": True,
    "activation_function": "gelu",
    "hidden_size": 1024,
    "dropout": 0.0,
    "attention_dropout": 0.0,
    "activation_dropout": 0.0,
    "pad_token_id": _pad_eos_token_id,
    "eos_token_id": _pad_eos_token_id,
    "bos_token_id": _bos_token_id,
    "num_codebooks": num_codebooks,
}
decoder_config = ParlerTTSDecoderConfig(**_decoder_settings)

# Instantiate the decoder from its config alone and persist it so that it
# can be passed by path to from_sub_models_pretrained.
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained(_decoder_dir)

model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path=_decoder_dir,
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = _bos_token_id
model.generation_config.pad_token_id = _pad_eos_token_id
model.generation_config.eos_token_id = _pad_eos_token_id

# set other default generation config params
_frame_rate = model.audio_encoder.config.frame_rate
model.generation_config.max_length = int(30 * _frame_rate)  # 30 * frames per second
model.generation_config.do_sample = False  # author's alternative: True
model.generation_config.guidance_scale = 1  # author's alternative: 3.0

model.save_pretrained("/raid/yoach/tmp/artefacts/small-stable-speech-untrained/")
init_model_75M.py
deleted
100644 → 0
View file @
5bfd88ee
# Initialisation script for the smaller ("75M") variant: builds a Parler-TTS
# decoder from a fresh config, combines it with a pretrained text encoder and
# a pretrained DAC audio encoder, sets default generation parameters, and
# saves the combined model.
from parler_tts import (
    DACConfig,
    DACModel,
    ParlerTTSConfig,
    ParlerTTSDecoderConfig,
    ParlerTTSForCausalLM,
    ParlerTTSForConditionalGeneration,
)
from transformers import AutoConfig, AutoModel, EncodecConfig, T5Config

# Make the "dac" model type known to the Auto* classes before any
# from_pretrained call is issued against the DAC checkpoint.
AutoConfig.register("dac", DACConfig)
AutoModel.register(DACConfig, DACModel)

text_model = "google/t5-v1_1-small"
encodec_version = "ylacombe/dac_44khZ_8kbps"
num_codebooks = 9

# Only the configs are loaded here; they supply the vocabulary/codebook sizes.
t5 = AutoConfig.from_pretrained(text_model)
encodec = AutoConfig.from_pretrained(encodec_version)

encodec_vocab_size = encodec.codebook_size

# Special token ids are placed right after the regular codebook entries:
# pad and eos share one id, bos takes the next one.
_pad_eos_token_id = encodec_vocab_size
_bos_token_id = encodec_vocab_size + 1

# The decoder is written to this directory and read back from it below.
_decoder_dir = "/raid/yoach/tmp/artefacts/decoder_small/"

_decoder_settings = {
    "vocab_size": encodec_vocab_size + 1,
    "max_position_embeddings": 4096,  # author's note: 30 s = 2580
    "num_hidden_layers": 8,
    "ffn_dim": 3072,
    "num_attention_heads": 12,
    "layerdrop": 0.0,
    "use_cache": True,
    "activation_function": "gelu",
    "hidden_size": 768,
    "dropout": 0.0,
    "attention_dropout": 0.0,
    "activation_dropout": 0.0,
    "pad_token_id": _pad_eos_token_id,
    "eos_token_id": _pad_eos_token_id,
    "bos_token_id": _bos_token_id,
    "num_codebooks": num_codebooks,
}
decoder_config = ParlerTTSDecoderConfig(**_decoder_settings)

# Instantiate the decoder from its config alone and persist it so that it
# can be passed by path to from_sub_models_pretrained.
decoder = ParlerTTSForCausalLM(decoder_config)
decoder.save_pretrained(_decoder_dir)

model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path=text_model,
    audio_encoder_pretrained_model_name_or_path=encodec_version,
    decoder_pretrained_model_name_or_path=_decoder_dir,
    vocab_size=t5.vocab_size,
)

# set the appropriate bos/pad token ids
model.generation_config.decoder_start_token_id = _bos_token_id
model.generation_config.pad_token_id = _pad_eos_token_id
model.generation_config.eos_token_id = _pad_eos_token_id

# set other default generation config params
_frame_rate = model.audio_encoder.config.frame_rate
model.generation_config.max_length = int(30 * _frame_rate)  # 30 * frames per second
model.generation_config.do_sample = False  # author's alternative: True
model.generation_config.guidance_scale = 1  # author's alternative: 3.0

model.save_pretrained("/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment