Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
parler-tts
Commits
31a54850
Commit
31a54850
authored
Apr 08, 2024
by
Yoach Lacombe
Browse files
make style
parent
91542bfa
Changes
12
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
497 additions
and
404 deletions
+497
-404
init_dummy_model.py
init_dummy_model.py
+8
-10
init_dummy_model_dac.py
init_dummy_model_dac.py
+8
-10
init_model.py
init_model.py
+9
-11
init_model_75M.py
init_model_75M.py
+9
-11
parler_tts/__init__.py
parler_tts/__init__.py
+7
-2
parler_tts/configuration_parler_tts.py
parler_tts/configuration_parler_tts.py
+1
-1
parler_tts/dac_wrapper/__init__.py
parler_tts/dac_wrapper/__init__.py
+1
-1
parler_tts/dac_wrapper/configuration_dac.py
parler_tts/dac_wrapper/configuration_dac.py
+3
-4
parler_tts/dac_wrapper/modeling_dac.py
parler_tts/dac_wrapper/modeling_dac.py
+24
-23
parler_tts/modeling_parler_tts.py
parler_tts/modeling_parler_tts.py
+87
-68
push_dac_to_hub.py
push_dac_to_hub.py
+3
-2
run_parler_tts_training.py
run_parler_tts_training.py
+337
-261
No files found.
init_dummy_model.py
View file @
31a54850
...
@@ -13,7 +13,7 @@ encodec_vocab_size = encodec.codebook_size
...
@@ -13,7 +13,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config
=
ParlerTTSDecoderConfig
(
decoder_config
=
ParlerTTSDecoderConfig
(
vocab_size
=
encodec_vocab_size
+
1
,
vocab_size
=
encodec_vocab_size
+
1
,
max_position_embeddings
=
2048
,
max_position_embeddings
=
2048
,
num_hidden_layers
=
4
,
num_hidden_layers
=
4
,
ffn_dim
=
512
,
ffn_dim
=
512
,
...
@@ -27,34 +27,32 @@ decoder_config = ParlerTTSDecoderConfig(
...
@@ -27,34 +27,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout
=
0.0
,
activation_dropout
=
0.0
,
pad_token_id
=
encodec_vocab_size
,
pad_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
bos_token_id
=
encodec_vocab_size
+
1
,
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
num_codebooks
=
num_codebooks
,
)
)
# TODO: ?? how to make it stop ?
# TODO: ?? how to make it stop ?
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
vocab_size
=
t5
.
vocab_size
vocab_size
=
t5
.
vocab_size
,
)
)
# set the appropriate bos/pad token ids
# set the appropriate bos/pad token ids
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
# set other default generation config params
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/tiny-model/"
)
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/tiny-model/"
)
\ No newline at end of file
init_dummy_model_dac.py
View file @
31a54850
...
@@ -20,7 +20,7 @@ encodec_vocab_size = encodec.codebook_size
...
@@ -20,7 +20,7 @@ encodec_vocab_size = encodec.codebook_size
decoder_config
=
ParlerTTSDecoderConfig
(
decoder_config
=
ParlerTTSDecoderConfig
(
vocab_size
=
encodec_vocab_size
+
1
,
vocab_size
=
encodec_vocab_size
+
1
,
max_position_embeddings
=
2048
,
max_position_embeddings
=
2048
,
num_hidden_layers
=
4
,
num_hidden_layers
=
4
,
ffn_dim
=
512
,
ffn_dim
=
512
,
...
@@ -34,34 +34,32 @@ decoder_config = ParlerTTSDecoderConfig(
...
@@ -34,34 +34,32 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout
=
0.0
,
activation_dropout
=
0.0
,
pad_token_id
=
encodec_vocab_size
,
pad_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
bos_token_id
=
encodec_vocab_size
+
1
,
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
num_codebooks
=
num_codebooks
,
)
)
# TODO: ?? how to make it stop ?
# TODO: ?? how to make it stop ?
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
vocab_size
=
t5
.
vocab_size
vocab_size
=
t5
.
vocab_size
,
)
)
# set the appropriate bos/pad token ids
# set the appropriate bos/pad token ids
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
# set other default generation config params
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/tiny-dac-model/"
)
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/tiny-dac-model/"
)
\ No newline at end of file
init_model.py
View file @
31a54850
...
@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
...
@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config
=
ParlerTTSDecoderConfig
(
decoder_config
=
ParlerTTSDecoderConfig
(
vocab_size
=
encodec_vocab_size
+
1
,
vocab_size
=
encodec_vocab_size
+
1
,
max_position_embeddings
=
3000
,
# 30 s = 2580
max_position_embeddings
=
3000
,
# 30 s = 2580
num_hidden_layers
=
12
,
num_hidden_layers
=
12
,
ffn_dim
=
4096
,
ffn_dim
=
4096
,
num_attention_heads
=
16
,
num_attention_heads
=
16
,
...
@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
...
@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout
=
0.0
,
activation_dropout
=
0.0
,
pad_token_id
=
encodec_vocab_size
,
pad_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
bos_token_id
=
encodec_vocab_size
+
1
,
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
num_codebooks
=
num_codebooks
,
)
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder/"
)
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder/"
,
vocab_size
=
t5
.
vocab_size
vocab_size
=
t5
.
vocab_size
,
)
)
# set the appropriate bos/pad token ids
# set the appropriate bos/pad token ids
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
# set other default generation config params
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/small-stable-speech-untrained/"
)
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/small-stable-speech-untrained/"
)
\ No newline at end of file
init_model_75M.py
View file @
31a54850
...
@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
...
@@ -21,8 +21,8 @@ encodec_vocab_size = encodec.codebook_size
decoder_config
=
ParlerTTSDecoderConfig
(
decoder_config
=
ParlerTTSDecoderConfig
(
vocab_size
=
encodec_vocab_size
+
1
,
vocab_size
=
encodec_vocab_size
+
1
,
max_position_embeddings
=
4096
,
# 30 s = 2580
max_position_embeddings
=
4096
,
# 30 s = 2580
num_hidden_layers
=
8
,
num_hidden_layers
=
8
,
ffn_dim
=
3072
,
ffn_dim
=
3072
,
num_attention_heads
=
12
,
num_attention_heads
=
12
,
...
@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
...
@@ -35,33 +35,31 @@ decoder_config = ParlerTTSDecoderConfig(
activation_dropout
=
0.0
,
activation_dropout
=
0.0
,
pad_token_id
=
encodec_vocab_size
,
pad_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
eos_token_id
=
encodec_vocab_size
,
bos_token_id
=
encodec_vocab_size
+
1
,
bos_token_id
=
encodec_vocab_size
+
1
,
num_codebooks
=
num_codebooks
,
num_codebooks
=
num_codebooks
,
)
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
=
ParlerTTSForCausalLM
(
decoder_config
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder_small/"
)
decoder
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/decoder_small/"
)
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
model
=
ParlerTTSForConditionalGeneration
.
from_sub_models_pretrained
(
text_encoder_pretrained_model_name_or_path
=
text_model
,
text_encoder_pretrained_model_name_or_path
=
text_model
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
audio_encoder_pretrained_model_name_or_path
=
encodec_version
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder_small/"
,
decoder_pretrained_model_name_or_path
=
"/raid/yoach/tmp/artefacts/decoder_small/"
,
vocab_size
=
t5
.
vocab_size
vocab_size
=
t5
.
vocab_size
,
)
)
# set the appropriate bos/pad token ids
# set the appropriate bos/pad token ids
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
decoder_start_token_id
=
encodec_vocab_size
+
1
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
pad_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
model
.
generation_config
.
eos_token_id
=
encodec_vocab_size
# set other default generation config params
# set other default generation config params
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
max_length
=
int
(
30
*
model
.
audio_encoder
.
config
.
frame_rate
)
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
do_sample
=
False
# True
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
generation_config
.
guidance_scale
=
1
# 3.0
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/"
)
model
.
save_pretrained
(
"/raid/yoach/tmp/artefacts/stable-speech-untrained-75M/"
)
\ No newline at end of file
parler_tts/__init__.py
View file @
31a54850
from
.configuration_parler_tts
import
ParlerTTSConfig
,
ParlerTTSDecoderConfig
from
.configuration_parler_tts
import
ParlerTTSConfig
,
ParlerTTSDecoderConfig
from
.modeling_parler_tts
import
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
apply_delay_pattern_mask
,
build_delay_pattern_mask
from
.modeling_parler_tts
import
(
ParlerTTSForCausalLM
,
ParlerTTSForConditionalGeneration
,
apply_delay_pattern_mask
,
build_delay_pattern_mask
,
)
from
.dac_wrapper
import
DACConfig
,
DACModel
from
.dac_wrapper
import
DACConfig
,
DACModel
\ No newline at end of file
parler_tts/configuration_parler_tts.py
View file @
31a54850
...
@@ -81,7 +81,7 @@ class ParlerTTSDecoderConfig(PretrainedConfig):
...
@@ -81,7 +81,7 @@ class ParlerTTSDecoderConfig(PretrainedConfig):
def
__init__
(
def
__init__
(
self
,
self
,
vocab_size
=
2049
,
# vocab size = 2048 (encodec vocab size) + 1 (eos)
vocab_size
=
2049
,
# vocab size = 2048 (encodec vocab size) + 1 (eos)
max_position_embeddings
=
2048
,
max_position_embeddings
=
2048
,
num_hidden_layers
=
24
,
num_hidden_layers
=
24
,
ffn_dim
=
4096
,
ffn_dim
=
4096
,
...
...
parler_tts/dac_wrapper/__init__.py
View file @
31a54850
from
.configuration_dac
import
DACConfig
from
.configuration_dac
import
DACConfig
from
.modeling_dac
import
DACModel
from
.modeling_dac
import
DACModel
\ No newline at end of file
parler_tts/dac_wrapper/configuration_dac.py
View file @
31a54850
...
@@ -8,17 +8,16 @@ class DACConfig(PretrainedConfig):
...
@@ -8,17 +8,16 @@ class DACConfig(PretrainedConfig):
def
__init__
(
def
__init__
(
self
,
self
,
num_codebooks
:
int
=
9
,
num_codebooks
:
int
=
9
,
model_bitrate
:
int
=
8
,
# kbps
model_bitrate
:
int
=
8
,
# kbps
codebook_size
:
int
=
1024
,
codebook_size
:
int
=
1024
,
latent_dim
:
int
=
1024
,
latent_dim
:
int
=
1024
,
frame_rate
:
int
=
86
,
frame_rate
:
int
=
86
,
**
kwargs
,
**
kwargs
,
):
):
self
.
codebook_size
=
codebook_size
self
.
codebook_size
=
codebook_size
self
.
model_bitrate
=
model_bitrate
self
.
model_bitrate
=
model_bitrate
self
.
latent_dim
=
latent_dim
self
.
latent_dim
=
latent_dim
self
.
num_codebooks
=
num_codebooks
self
.
num_codebooks
=
num_codebooks
self
.
frame_rate
=
frame_rate
self
.
frame_rate
=
frame_rate
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
\ No newline at end of file
parler_tts/dac_wrapper/modeling_dac.py
View file @
31a54850
...
@@ -7,22 +7,24 @@ from .configuration_dac import DACConfig
...
@@ -7,22 +7,24 @@ from .configuration_dac import DACConfig
from
dac.model
import
DAC
from
dac.model
import
DAC
# model doesn't support batching yet
# model doesn't support batching yet
class
DACModel
(
PreTrainedModel
):
class
DACModel
(
PreTrainedModel
):
config_class
=
DACConfig
config_class
=
DACConfig
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
().
__init__
(
config
)
super
().
__init__
(
config
)
self
.
model
=
DAC
(
self
.
model
=
DAC
(
n_codebooks
=
config
.
num_codebooks
,
n_codebooks
=
config
.
num_codebooks
,
latent_dim
=
config
.
latent_dim
,
latent_dim
=
config
.
latent_dim
,
codebook_size
=
config
.
codebook_size
,
codebook_size
=
config
.
codebook_size
,
)
)
def
encode
(
self
,
input_values
,
padding_mask
=
None
,
bandwidth
=
None
,
return_dict
=
None
,
n_quantizers
=
None
,
sample_rate
=
None
):
def
encode
(
self
,
input_values
,
padding_mask
=
None
,
bandwidth
=
None
,
return_dict
=
None
,
n_quantizers
=
None
,
sample_rate
=
None
):
"""
"""
Encodes the input audio waveform into discrete codes.
Encodes the input audio waveform into discrete codes.
...
@@ -44,7 +46,7 @@ class DACModel(PreTrainedModel):
...
@@ -44,7 +46,7 @@ class DACModel(PreTrainedModel):
factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
`codebook` of shape `[batch_size, num_codebooks, frames]`.
`codebook` of shape `[batch_size, num_codebooks, frames]`.
Scale is not used here.
Scale is not used here.
"""
"""
_
,
channels
,
input_length
=
input_values
.
shape
_
,
channels
,
input_length
=
input_values
.
shape
...
@@ -52,12 +54,12 @@ class DACModel(PreTrainedModel):
...
@@ -52,12 +54,12 @@ class DACModel(PreTrainedModel):
raise
ValueError
(
f
"Number of audio channels must be 1 or 2, but got
{
channels
}
"
)
raise
ValueError
(
f
"Number of audio channels must be 1 or 2, but got
{
channels
}
"
)
audio_data
=
self
.
model
.
preprocess
(
input_values
,
sample_rate
)
audio_data
=
self
.
model
.
preprocess
(
input_values
,
sample_rate
)
return_dict
=
return_dict
if
return_dict
is
not
None
else
self
.
config
.
return_dict
return_dict
=
return_dict
if
return_dict
is
not
None
else
self
.
config
.
return_dict
# TODO: for now, no chunk length
# TODO: for now, no chunk length
chunk_length
=
None
# self.config.chunk_length
chunk_length
=
None
# self.config.chunk_length
if
chunk_length
is
None
:
if
chunk_length
is
None
:
chunk_length
=
input_length
chunk_length
=
input_length
stride
=
input_length
stride
=
input_length
...
@@ -79,9 +81,9 @@ class DACModel(PreTrainedModel):
...
@@ -79,9 +81,9 @@ class DACModel(PreTrainedModel):
for
offset
in
range
(
0
,
input_length
-
step
,
stride
):
for
offset
in
range
(
0
,
input_length
-
step
,
stride
):
mask
=
padding_mask
[...,
offset
:
offset
+
chunk_length
].
bool
()
mask
=
padding_mask
[...,
offset
:
offset
+
chunk_length
].
bool
()
frame
=
audio_data
[:,
:,
offset
:
offset
+
chunk_length
]
frame
=
audio_data
[:,
:,
offset
:
offset
+
chunk_length
]
scale
=
None
scale
=
None
_
,
encoded_frame
,
_
,
_
,
_
=
self
.
model
.
encode
(
frame
,
n_quantizers
=
n_quantizers
)
_
,
encoded_frame
,
_
,
_
,
_
=
self
.
model
.
encode
(
frame
,
n_quantizers
=
n_quantizers
)
encoded_frames
.
append
(
encoded_frame
)
encoded_frames
.
append
(
encoded_frame
)
scales
.
append
(
scale
)
scales
.
append
(
scale
)
...
@@ -92,15 +94,14 @@ class DACModel(PreTrainedModel):
...
@@ -92,15 +94,14 @@ class DACModel(PreTrainedModel):
return
(
encoded_frames
,
scales
)
return
(
encoded_frames
,
scales
)
return
EncodecEncoderOutput
(
encoded_frames
,
scales
)
return
EncodecEncoderOutput
(
encoded_frames
,
scales
)
def
decode
(
def
decode
(
self
,
self
,
audio_codes
,
audio_codes
,
audio_scales
,
audio_scales
,
padding_mask
=
None
,
padding_mask
=
None
,
return_dict
=
None
,
return_dict
=
None
,
):
):
"""
"""
Decodes the given frames into an output audio waveform.
Decodes the given frames into an output audio waveform.
...
@@ -125,12 +126,12 @@ class DACModel(PreTrainedModel):
...
@@ -125,12 +126,12 @@ class DACModel(PreTrainedModel):
if
len
(
audio_codes
)
!=
1
:
if
len
(
audio_codes
)
!=
1
:
raise
ValueError
(
f
"Expected one frame, got
{
len
(
audio_codes
)
}
"
)
raise
ValueError
(
f
"Expected one frame, got
{
len
(
audio_codes
)
}
"
)
audio_values
=
self
.
model
.
quantizer
.
from_codes
(
audio_codes
.
squeeze
(
0
))[
0
]
audio_values
=
self
.
model
.
quantizer
.
from_codes
(
audio_codes
.
squeeze
(
0
))[
0
]
audio_values
=
self
.
model
.
decode
(
audio_values
)
audio_values
=
self
.
model
.
decode
(
audio_values
)
if
not
return_dict
:
if
not
return_dict
:
return
(
audio_values
,)
return
(
audio_values
,)
return
EncodecDecoderOutput
(
audio_values
)
return
EncodecDecoderOutput
(
audio_values
)
def
forward
(
self
,
tensor
):
def
forward
(
self
,
tensor
):
raise
ValueError
(
f
"`DACModel.forward` not implemented yet"
)
raise
ValueError
(
f
"`DACModel.forward` not implemented yet"
)
\ No newline at end of file
parler_tts/modeling_parler_tts.py
View file @
31a54850
This diff is collapsed.
Click to expand it.
push_dac_to_hub.py
View file @
31a54850
import
dac
import
dac
# Download a model
# Download a model
model_path
=
dac
.
utils
.
download
(
model_type
=
"44khz"
)
model_path
=
dac
.
utils
.
download
(
model_type
=
"44khz"
)
model
=
dac
.
DAC
.
load
(
model_path
)
model
=
dac
.
DAC
.
load
(
model_path
)
...
@@ -10,6 +10,7 @@ hf_dac = DACModel(DACConfig())
...
@@ -10,6 +10,7 @@ hf_dac = DACModel(DACConfig())
hf_dac
.
model
.
load_state_dict
(
model
.
state_dict
())
hf_dac
.
model
.
load_state_dict
(
model
.
state_dict
())
from
transformers
import
AutoConfig
,
AutoModel
from
transformers
import
AutoConfig
,
AutoModel
AutoConfig
.
register
(
"dac"
,
DACConfig
)
AutoConfig
.
register
(
"dac"
,
DACConfig
)
AutoModel
.
register
(
DACConfig
,
DACModel
)
AutoModel
.
register
(
DACConfig
,
DACModel
)
...
@@ -20,4 +21,4 @@ hf_dac.push_to_hub("ylacombe/dac_44khZ_8kbps")
...
@@ -20,4 +21,4 @@ hf_dac.push_to_hub("ylacombe/dac_44khZ_8kbps")
from
transformers
import
EncodecFeatureExtractor
from
transformers
import
EncodecFeatureExtractor
EncodecFeatureExtractor
(
sampling_rate
=
44100
).
push_to_hub
(
"ylacombe/dac_44khZ_8kbps"
)
EncodecFeatureExtractor
(
sampling_rate
=
44100
).
push_to_hub
(
"ylacombe/dac_44khZ_8kbps"
)
\ No newline at end of file
run_parler_tts_training.py
View file @
31a54850
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment