ModelZoo / CosyVoice_pytorch, commit 8d03db9a
Authored Aug 26, 2024 by wanglch: "Initial commit"
Pipeline #1602 canceled with stages. Changes: 85.

Showing 20 changed files with 3728 additions and 0 deletions (+3728, -0):
cosyvoice/flow/flow.py (+141, -0)
cosyvoice/flow/flow_matching.py (+138, -0)
cosyvoice/flow/length_regulator.py (+49, -0)
cosyvoice/hifigan/f0_predictor.py (+55, -0)
cosyvoice/hifigan/generator.py (+391, -0)
cosyvoice/llm/llm.py (+206, -0)
cosyvoice/transformer/__init__.py (+0, -0)
cosyvoice/transformer/activation.py (+84, -0)
cosyvoice/transformer/attention.py (+326, -0)
cosyvoice/transformer/convolution.py (+145, -0)
cosyvoice/transformer/decoder.py (+396, -0)
cosyvoice/transformer/decoder_layer.py (+132, -0)
cosyvoice/transformer/embedding.py (+293, -0)
cosyvoice/transformer/encoder.py (+472, -0)
cosyvoice/transformer/encoder_layer.py (+236, -0)
cosyvoice/transformer/label_smoothing_loss.py (+96, -0)
cosyvoice/transformer/positionwise_feed_forward.py (+115, -0)
cosyvoice/transformer/subsampling.py (+383, -0)
cosyvoice/utils/__init__.py (+0, -0)
cosyvoice/utils/class_utils.py (+70, -0)
cosyvoice/flow/flow.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
from typing import Dict, Optional
import torch
import torch.nn as nn
from torch.nn import functional as F
from omegaconf import DictConfig
from cosyvoice.utils.mask import make_pad_mask


class MaskedDiffWithXvec(torch.nn.Module):
    def __init__(self,
                 input_size: int = 512,
                 output_size: int = 80,
                 spk_embed_dim: int = 192,
                 output_type: str = "mel",
                 vocab_size: int = 4096,
                 input_frame_rate: int = 50,
                 only_mask_loss: bool = True,
                 encoder: torch.nn.Module = None,
                 length_regulator: torch.nn.Module = None,
                 decoder: torch.nn.Module = None,
                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.decoder_conf = decoder_conf
        self.mel_feat_conf = mel_feat_conf
        self.vocab_size = vocab_size
        self.output_type = output_type
        self.input_frame_rate = input_frame_rate
        logging.info(f"input frame rate={self.input_frame_rate}")
        self.input_embedding = nn.Embedding(vocab_size, input_size)
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
        self.encoder = encoder
        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
        self.decoder = decoder
        self.length_regulator = length_regulator
        self.only_mask_loss = only_mask_loss

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        token = batch['speech_token'].to(device)
        token_len = batch['speech_token_len'].to(device)
        feat = batch['speech_feat'].to(device)
        feat_len = batch['speech_feat_len'].to(device)
        embedding = batch['embedding'].to(device)

        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions
        conds = torch.zeros(feat.shape, device=token.device)
        for i, j in enumerate(feat_len):
            if random.random() < 0.5:
                continue
            index = random.randint(0, int(0.3 * j))
            conds[i, :index] = feat[i, :index]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
        loss, _ = self.decoder.compute_loss(
            feat.transpose(1, 2).contiguous(),
            mask.unsqueeze(1),
            h.transpose(1, 2).contiguous(),
            embedding,
            cond=conds
        )
        return {'loss': loss}

    @torch.inference_mode()
    def inference(self,
                  token,
                  token_len,
                  prompt_token,
                  prompt_token_len,
                  prompt_feat,
                  prompt_feat_len,
                  embedding):
        assert token.shape[0] == 1
        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        feat_len = (token_len / 50 * 22050 / 256).int()
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions
        conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
        if prompt_feat.shape[1] != 0:
            for i, j in enumerate(prompt_feat_len):
                conds[i, :j] = prompt_feat[i]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        feat = self.decoder(
            mu=h.transpose(1, 2).contiguous(),
            mask=mask.unsqueeze(1),
            spks=embedding,
            cond=conds,
            n_timesteps=10
        )
        if prompt_feat.shape[1] != 0:
            feat = feat[:, :, prompt_feat.shape[1]:]
        return feat
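
A note on the hardcoded conversion in `inference` above: `(token_len / 50 * 22050 / 256).int()` maps 50 Hz speech tokens to mel frames at the 22050 Hz sampling rate and 256-sample hop size given in `mel_feat_conf`. A minimal sketch of that arithmetic (toy value, not from this commit):

import torch

token_len = torch.tensor([100])                  # 100 tokens at 50 Hz = 2 s of speech
feat_len = (token_len / 50 * 22050 / 256).int()  # seconds -> samples -> mel frames
print(feat_len)                                  # tensor([172], dtype=torch.int32)
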
cosyvoice/flow/flow_matching.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM


class ConditionalCFM(BASECFM):
    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
        super().__init__(
            n_feats=in_channels,
            cfm_params=cfm_params,
            n_spks=n_spks,
            spk_emb_dim=spk_emb_dim,
        )
        self.t_scheduler = cfm_params.t_scheduler
        self.training_cfg_rate = cfm_params.training_cfg_rate
        self.inference_cfg_rate = cfm_params.inference_cfg_rate
        in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
        # Just change the architecture of the estimator here
        self.estimator = estimator

    @torch.inference_mode()
    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        if self.t_scheduler == 'cosine':
            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)

    def solve_euler(self, x, t_span, mu, mask, spks, cond):
        """
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []

        for step in range(1, len(t_span)):
            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
            # Classifier-Free Guidance inference introduced in VoiceBox
            if self.inference_cfg_rate > 0:
                cfg_dphi_dt = self.estimator(
                    x, mask,
                    torch.zeros_like(mu), t,
                    torch.zeros_like(spks) if spks is not None else None,
                    torch.zeros_like(cond)
                )
                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
                           self.inference_cfg_rate * cfg_dphi_dt)
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        return sol[-1]

    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
        """Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b, _, t = mu.shape

        # random timestep
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        if self.t_scheduler == 'cosine':
            t = 1 - torch.cos(t * 0.5 * torch.pi)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
        if self.training_cfg_rate > 0:
            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
            mu = mu * cfg_mask.view(-1, 1, 1)
            spks = spks * cfg_mask.view(-1, 1)
            cond = cond * cfg_mask.view(-1, 1, 1)

        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
        return loss, y
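
For intuition, `compute_loss` regresses the estimator onto the constant velocity of a straight-line path from noise to target: y_t = (1 - (1 - sigma_min) * t) * z + t * x1, with regression target u = x1 - (1 - sigma_min) * z. A small self-contained check of the path endpoints (toy shapes, not from this commit):

import torch

sigma_min = 1e-6
x1 = torch.randn(2, 80, 100)   # target mel
z = torch.randn_like(x1)       # noise sample x_0

for t in (0.0, 1.0):
    y = (1 - (1 - sigma_min) * t) * z + t * x1
    # at t=0 the path starts at the noise; at t=1 it (almost) reaches the target
    print(t, torch.allclose(y, z if t == 0.0 else x1, atol=1e-4))

u = x1 - (1 - sigma_min) * z   # the constant velocity along that straight line
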
cosyvoice/flow/length_regulator.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch.nn as nn
from torch.nn import functional as F
from cosyvoice.utils.mask import make_pad_mask


class InterpolateRegulator(nn.Module):
    def __init__(
            self,
            channels: int,
            sampling_ratios: Tuple,
            out_channels: int = None,
            groups: int = 1,
    ):
        super().__init__()
        self.sampling_ratios = sampling_ratios
        out_channels = out_channels or channels
        model = nn.ModuleList([])
        if len(sampling_ratios) > 0:
            for _ in sampling_ratios:
                module = nn.Conv1d(channels, channels, 3, 1, 1)
                norm = nn.GroupNorm(groups, channels)
                act = nn.Mish()
                model.extend([module, norm, act])
        model.append(
            nn.Conv1d(channels, out_channels, 1, 1)
        )
        self.model = nn.Sequential(*model)

    def forward(self, x, ylens=None):
        # x in (B, T, D)
        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
        out = self.model(x).transpose(1, 2).contiguous()
        olens = ylens
        return out * mask, olens
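
The core of `InterpolateRegulator.forward` is nearest-neighbour interpolation along the time axis up to the target mel length; a standalone sketch of just that resizing step (toy shapes assumed):

import torch
import torch.nn.functional as F

x = torch.randn(1, 120, 512)                       # (B, T=120, D)
target_len = 172
y = F.interpolate(x.transpose(1, 2).contiguous(),  # interpolate expects (B, D, T)
                  size=target_len, mode='nearest')
print(y.shape)                                     # torch.Size([1, 512, 172])
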
cosyvoice/hifigan/f0_predictor.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


class ConvRNNF0Predictor(nn.Module):
    def __init__(self,
                 num_class: int = 1,
                 in_channels: int = 80,
                 cond_channels: int = 512
                 ):
        super().__init__()

        self.num_class = num_class
        self.condnet = nn.Sequential(
            weight_norm(
                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
        )
        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.condnet(x)
        x = x.transpose(1, 2)
        return torch.abs(self.classifier(x).squeeze(-1))
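
A hedged usage sketch of the predictor above (assuming the class is importable as defined): the five stride-1, padding-1 convolutions preserve the frame count, so one non-negative F0 value comes out per input mel frame.

import torch

predictor = ConvRNNF0Predictor(in_channels=80, cond_channels=512)
mel = torch.randn(1, 80, 172)   # (B, n_mels, frames)
f0 = predictor(mel)             # abs() keeps the output non-negative
print(f0.shape)                 # torch.Size([1, 172])
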
cosyvoice/hifigan/generator.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HIFI-GAN"""
import typing as tp
import numpy as np
from scipy.signal import get_window
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d
from torch.nn import ConvTranspose1d
from torch.nn.utils import remove_weight_norm
from torch.nn.utils import weight_norm
from torch.distributions.uniform import Uniform

from cosyvoice.transformer.activation import Snake
from cosyvoice.utils.common import get_padding
from cosyvoice.utils.common import init_weights


"""hifigan based generator implementation.
This code is modified from https://github.com/jik876/hifi-gan,
https://github.com/kan-bayashi/ParallelWaveGAN and
https://github.com/NVIDIA/BigVGAN
"""


class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN."""
    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: tp.List[int] = [1, 3, 5],
    ):
        super(ResBlock, self).__init__()
        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()

        for dilation in dilations:
            self.convs1.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation,
                        padding=get_padding(kernel_size, dilation)
                    )
                )
            )
            self.convs2.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1)
                    )
                )
            )
        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)
        self.activations1 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs1))
        ])
        self.activations2 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs2))
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for idx in range(len(self.convs1)):
            xt = self.activations1[idx](x)
            xt = self.convs1[idx](xt)
            xt = self.activations2[idx](xt)
            xt = self.convs2[idx](xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for idx in range(len(self.convs1)):
            remove_weight_norm(self.convs1[idx])
            remove_weight_norm(self.convs2[idx])


class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: [B, 1, sample_len]
        """

        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for i in range(self.harmonic_num + 1):
            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate

        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
        phase_vec[:, 0, :] = 0

        # generate sine waveforms
        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced should be similar to sine_amp
        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
        #        for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise


class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length, 1)
        """
        # source for harmonic branch
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv


class HiFTGenerator(nn.Module):
    """
    HiFTNet Generator: Neural Source Filter + ISTFTNet
    https://arxiv.org/abs/2309.09493
    """
    def __init__(
            self,
            in_channels: int = 80,
            base_channels: int = 512,
            nb_harmonics: int = 8,
            sampling_rate: int = 22050,
            nsf_alpha: float = 0.1,
            nsf_sigma: float = 0.003,
            nsf_voiced_threshold: float = 10,
            upsample_rates: tp.List[int] = [8, 8],
            upsample_kernel_sizes: tp.List[int] = [16, 16],
            istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
            resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
            resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            source_resblock_kernel_sizes: tp.List[int] = [7, 11],
            source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
            lrelu_slope: float = 0.1,
            audio_limit: float = 0.99,
            f0_predictor: torch.nn.Module = None,
    ):
        super(HiFTGenerator, self).__init__()

        self.out_channels = 1
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.istft_params = istft_params
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sampling_rate,
            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
            harmonic_num=nb_harmonics,
            sine_amp=nsf_alpha,
            add_noise_std=nsf_sigma,
            voiced_threshod=nsf_voiced_threshold)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])

        self.conv_pre = weight_norm(
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # Up
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # Down
        self.source_downs = nn.ModuleList()
        self.source_resblocks = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes,
                                          source_resblock_dilation_sizes)):
            if u == 1:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
                )
            else:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
                )

            self.source_resblocks.append(
                ResBlock(base_channels // (2 ** (i + 1)), k, d)
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
        self.f0_predictor = f0_predictor

    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t

        har_source, _, _ = self.m_source(f0)
        return har_source.transpose(1, 2)

    def _stft(self, x):
        spec = torch.stft(
            x,
            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"],
            window=self.stft_window.to(x.device),
            return_complex=True)
        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
        return spec[..., 0], spec[..., 1]

    def _istft(self, magnitude, phase):
        magnitude = torch.clip(magnitude, max=1e2)
        real = magnitude * torch.cos(phase)
        img = magnitude * torch.sin(phase)
        inverse_transform = torch.istft(torch.complex(real, img),
                                        self.istft_params["n_fft"], self.istft_params["hop_len"],
                                        self.istft_params["n_fft"],
                                        window=self.stft_window.to(magnitude.device))
        return inverse_transform

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        f0 = self.f0_predictor(x)
        s = self._f02source(f0)

        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            # fusion
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        # NOTE: `self.source_module` is never defined on this class; this line
        # presumably means `self.m_source` and raises AttributeError if reached.
        self.source_module.remove_weight_norm()
        for l in self.source_downs:
            remove_weight_norm(l)
        for l in self.source_resblocks:
            l.remove_weight_norm()

    @torch.inference_mode()
    def inference(self, mel: torch.Tensor) -> torch.Tensor:
        return self.forward(x=mel)
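
For reference, `conv_post` emits `n_fft + 2` channels that `forward` splits into `n_fft/2 + 1` log-magnitude bins and `n_fft/2 + 1` phase bins before the iSTFT; a minimal sketch of that split with the default `n_fft = 16` (random values, illustration only):

import torch

n_fft = 16                                       # istft_params["n_fft"] default above
x = torch.randn(1, n_fft + 2, 50)                # conv_post output: (B, n_fft + 2, frames)
magnitude = torch.exp(x[:, :n_fft // 2 + 1, :])  # first n_fft/2 + 1 channels: log-magnitude
phase = torch.sin(x[:, n_fft // 2 + 1:, :])      # remaining channels: sin-squashed phase
print(magnitude.shape, phase.shape)              # both torch.Size([1, 9, 50])
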
cosyvoice/llm/llm.py (new file, mode 100644)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional, Union
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, unpad_sequence
from cosyvoice.utils.common import IGNORE_ID
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
from cosyvoice.utils.common import th_accuracy


class TransformerLM(torch.nn.Module):
    def __init__(
            self,
            text_encoder_input_size: int,
            llm_input_size: int,
            llm_output_size: int,
            text_token_size: int,
            speech_token_size: int,
            text_encoder: torch.nn.Module,
            llm: torch.nn.Module,
            length_normalized_loss: bool = True,
            lsm_weight: float = 0.0,
            spk_embed_dim: int = 192,
    ):
        super().__init__()
        self.llm_input_size = llm_input_size
        self.speech_token_size = speech_token_size
        # 1. build text token inputs related modules
        self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
        self.text_encoder = text_encoder
        self.text_encoder_affine_layer = nn.Linear(
            self.text_encoder.output_size(),
            llm_input_size
        )

        # 2. build speech token language model related modules
        self.sos_eos = 0
        self.task_id = 1
        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
        self.llm = llm
        self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1)
        self.criterion_ce = LabelSmoothingLoss(
            size=speech_token_size + 1,
            padding_idx=IGNORE_ID,
            smoothing=lsm_weight,
            normalize_length=length_normalized_loss,
        )

        # 3. [Optional] build speech token related modules
        self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)

    def encode(
            self,
            text: torch.Tensor,
            text_lengths: torch.Tensor,
    ):
        encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        encoder_out = self.text_encoder_affine_layer(encoder_out)
        return encoder_out, encoder_out_lens

    def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len,
                           task_id_emb, speech_token, speech_token_len):
        text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
        speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i],
                                  task_id_emb.squeeze(dim=0), speech_token[i]], dim=0)
                    for i in range(len(text_token))]
        lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
        lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
        return lm_input, lm_input_len

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        """
        Args:
            text: (B, L, D)
            text_lengths: (B,)
            audio: (B, T, N) or (B, T)
            audio_lengths: (B,)
        """
        text_token = batch['text_token'].to(device)
        text_token_len = batch['text_token_len'].to(device)
        speech_token = batch['speech_token'].to(device)
        speech_token_len = batch['speech_token_len'].to(device)
        embedding = batch['embedding'].to(device)

        # 1. prepare llm_target
        lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) +
                                  speech_token[i, :speech_token_len[i]].tolist() +
                                  [self.speech_token_size]) for i in range(text_token.size(0))]
        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)

        # 1. encode text_token
        text_token = self.text_embedding(text_token)
        text_token, text_token_len = self.encode(text_token, text_token_len)

        # 2. embedding projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)
        embedding = embedding.unsqueeze(1)

        # 3. eos and task_id
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)

        # 4. encode speech_token
        speech_token = self.speech_embedding(speech_token)

        # 5. unpad and pad
        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len,
                                                         task_id_emb, speech_token, speech_token_len)

        # 6. run lm forward
        lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
        logits = self.llm_decoder(lm_output)
        loss = self.criterion_ce(logits, lm_target)
        acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID)
        return {'loss': loss, 'acc': acc}

    def sampling_ids(
            self,
            weighted_scores: torch.Tensor,
            sampling: Union[bool, int, float] = True,
            beam_size: int = 1,
            ignore_eos: bool = True,
    ):
        while True:
            prob, indices = weighted_scores.softmax(dim=-1).topk(sampling)
            top_ids = prob.multinomial(beam_size, replacement=True)
            top_ids = indices[top_ids]
            if (not ignore_eos) or (self.speech_token_size not in top_ids):
                break
        return top_ids

    @torch.inference_mode()
    def inference(
            self,
            text: torch.Tensor,
            text_len: torch.Tensor,
            prompt_text: torch.Tensor,
            prompt_text_len: torch.Tensor,
            prompt_speech_token: torch.Tensor,
            prompt_speech_token_len: torch.Tensor,
            embedding: torch.Tensor,
            beam_size: int = 1,
            sampling: int = 25,
            max_token_text_ratio: float = 20,
            min_token_text_ratio: float = 2,
    ) -> torch.Tensor:
        device = text.device
        text = torch.concat([prompt_text, text], dim=1)
        text_len += prompt_text_len
        text = self.text_embedding(text)

        # 1. encode text
        text, text_len = self.encode(text, text_len)

        # 2. encode embedding
        if embedding.shape[0] != 0:
            embedding = F.normalize(embedding, dim=1)
            embedding = self.spk_embed_affine_layer(embedding)
            embedding = embedding.unsqueeze(dim=1)
        else:
            embedding = torch.zeros(1, 0, self.llm_input_size).to(device)

        # 3. concat llm_input
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
        if prompt_speech_token_len != 0:
            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
        else:
            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size).to(device)
        lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)

        # 4. cal min/max_length
        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)

        # 5. step by step decode
        out_tokens = []
        offset = 0
        att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
        for i in range(max_len):
            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(
                lm_input, offset=0, required_cache_size=-1,
                att_cache=att_cache, cnn_cache=cnn_cache,
                att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
                                               device=lm_input.device)).to(torch.bool))
            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
            top_ids = self.sampling_ids(logp.squeeze(dim=0), sampling, beam_size,
                                        ignore_eos=True if i < min_len else False).item()
            if top_ids == self.speech_token_size:
                break
            out_tokens.append(top_ids)
            offset += lm_input.size(1)
            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)

        return torch.tensor([out_tokens], dtype=torch.int64, device=device)
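
`sampling_ids` implements plain top-k sampling (with `sampling` as k), resampling while the EOS id appears before `min_len` is reached; a stripped-down sketch of a single draw (toy logits, beam_size = 1, not from this commit):

import torch

weighted_scores = torch.randn(4097)                       # logits over speech tokens + eos
prob, indices = weighted_scores.softmax(dim=-1).topk(25)  # keep the 25 most likely ids
top_ids = indices[prob.multinomial(1, replacement=True)]  # sample one of them
print(top_ids.item())
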
cosyvoice/transformer/__init__.py (new file, mode 100644, empty)
cosyvoice/transformer/activation.py (new file, mode 100644)
# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
# 2020 Northwestern Polytechnical University (Pengcheng Guo)
# 2020 Mobvoi Inc (Binbin Zhang)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Swish() activation function for Conformer."""
import torch
from torch import nn, sin, pow
from torch.nn import Parameter


class Swish(torch.nn.Module):
    """Construct a Swish object."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Swish activation function."""
        return x * torch.sigmoid(x)


# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
#   LICENSE is in incl_licenses directory.
class Snake(nn.Module):
    '''
    Implementation of a sine-based periodic activation function
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: shape of the input
            - alpha: trainable parameter
            alpha is initialized to 1 by default; higher values = higher frequency.
            alpha will be trained along with the rest of your model.
        '''
        super(Snake, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:
            # log scale alphas initialized to zeros
            self.alpha = Parameter(torch.zeros(in_features) * alpha)
        else:
            # linear scale alphas initialized to ones
            self.alpha = Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2(x * a)
        '''
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)

        return x
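
A quick numeric check of the Snake formula above, x + (1/alpha) * sin^2(alpha * x), at alpha = 1 and x = pi/2 where sin^2(x) = 1:

import torch

x = torch.tensor(torch.pi / 2)
print(x + torch.sin(x) ** 2)   # tensor(2.5708), i.e. pi/2 + 1
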
cosyvoice/transformer/attention.py (new file, mode 100644)
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""
import math
from typing import Tuple

import torch
from torch import nn


class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.
    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct a MultiHeadedAttention object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform query, key and value.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
        Returns:
            torch.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Compute attention context vector.
        Args:
            value (torch.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).
        """
        n_batch = value.size(0)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
        #      1st chunk to ease the onnx export.]
        #   2. pytorch training
        if mask.size(2) > 0:  # time2 > 0
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For last chunk, time2 might be larger than scores.size(-1)
            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
            attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)  # (batch, head, time1, time2)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
        #   1. onnx(16/-1, -1/-1, 16/0)
        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k))  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
                1. When applying cross attention between decoder and encoder,
                   the batch padding mask for input is in (#batch, 1, T) shape.
                2. When applying self attention of encoder,
                   the mask is in (#batch, T, T) shape.
                3. When applying self attention of decoder,
                   the mask is in (#batch, L, L) shape.
                4. If the different position in decoder sees different blocks
                   of the encoder, such as Mocha, the passed-in mask could be
                   in (#batch, L, T) shape. But there is no such case in current
                   CosyVoice.
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)

        # NOTE(xcsong):
        #   when export onnx model, for 1st chunk, we feed
        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #       In all modes, `if cache.size(0) > 0` will always be `True`
        #       and we will always do splitting and
        #       concatenation (this will simplify onnx export). Note that
        #       it's OK to concat & split zero-shaped tensors (see code below).
        #   when export jit model, for 1st chunk, we always feed
        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        #   >>> a = torch.ones((1, 2, 0, 4))
        #   >>> b = torch.ones((1, 2, 3, 4))
        #   >>> c = torch.cat((a, b), dim=2)
        #   >>> torch.equal(b, c)        # True
        #   >>> d = torch.split(a, 2, dim=-1)
        #   >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache


class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding.
    Paper: https://arxiv.org/abs/1901.02860
    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct a RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate, key_bias)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # these two learnable biases are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x):
        """Compute relative positional encoding.
        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of query vector.
        Returns:
            torch.Tensor: Output tensor.
        """
        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)[
            :, :, :, : x.size(-1) // 2 + 1
        ]  # only keep the positions from 0 to time2

        return x

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, time2, size).
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        # NOTE(xcsong): same onnx/jit cache handling as in MultiHeadedAttention.forward
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
        if matrix_ac.shape != matrix_bd.shape:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
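
A hedged shape walk-through of the cache contract described in the docstrings (assuming the class above is importable; toy sizes): the returned cache concatenates K and V along the last dim, giving (B, head, time, d_k * 2).

import torch

mha = MultiHeadedAttention(n_head=2, n_feat=8, dropout_rate=0.0)
q = k = v = torch.randn(1, 5, 8)      # (batch, time, n_feat)
out, new_cache = mha(q, k, v)         # default fake mask/cache
print(out.shape)                      # torch.Size([1, 5, 8])
print(new_cache.shape)                # torch.Size([1, 2, 5, 8]) = (B, head, time, d_k * 2)
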
cosyvoice/transformer/convolution.py (new file, mode 100644)
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from
typing
import
Tuple
import
torch
from
torch
import
nn
class
ConvolutionModule
(
nn
.
Module
):
"""ConvolutionModule in Conformer model."""
def
__init__
(
self
,
channels
:
int
,
kernel_size
:
int
=
15
,
activation
:
nn
.
Module
=
nn
.
ReLU
(),
norm
:
str
=
"batch_norm"
,
causal
:
bool
=
False
,
bias
:
bool
=
True
):
"""Construct an ConvolutionModule object.
Args:
channels (int): The number of channels of conv layers.
kernel_size (int): Kernel size of conv layers.
causal (int): Whether use causal convolution or not
"""
super
().
__init__
()
self
.
pointwise_conv1
=
nn
.
Conv1d
(
channels
,
2
*
channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
,
bias
=
bias
,
)
# self.lorder is used to distinguish if it's a causal convolution,
# if self.lorder > 0: it's a causal convolution, the input will be
# padded with self.lorder frames on the left in forward.
# else: it's a symmetrical convolution
if
causal
:
padding
=
0
self
.
lorder
=
kernel_size
-
1
else
:
# kernel_size should be an odd number for none causal convolution
assert
(
kernel_size
-
1
)
%
2
==
0
padding
=
(
kernel_size
-
1
)
//
2
self
.
lorder
=
0
self
.
depthwise_conv
=
nn
.
Conv1d
(
channels
,
channels
,
kernel_size
,
stride
=
1
,
padding
=
padding
,
groups
=
channels
,
bias
=
bias
,
)
assert
norm
in
[
'batch_norm'
,
'layer_norm'
]
if
norm
==
"batch_norm"
:
self
.
use_layer_norm
=
False
self
.
norm
=
nn
.
BatchNorm1d
(
channels
)
else
:
self
.
use_layer_norm
=
True
self
.
norm
=
nn
.
LayerNorm
(
channels
)
self
.
pointwise_conv2
=
nn
.
Conv1d
(
channels
,
channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
,
bias
=
bias
,
)
self
.
activation
=
activation
def
forward
(
self
,
x
:
torch
.
Tensor
,
mask_pad
:
torch
.
Tensor
=
torch
.
ones
((
0
,
0
,
0
),
dtype
=
torch
.
bool
),
cache
:
torch
.
Tensor
=
torch
.
zeros
((
0
,
0
,
0
)),
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""Compute convolution module.
Args:
x (torch.Tensor): Input tensor (#batch, time, channels).
mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
(0, 0, 0) means fake mask.
cache (torch.Tensor): left context cache, it is only
used in causal convolution (#batch, channels, cache_t),
(0, 0, 0) meas fake cache.
Returns:
torch.Tensor: Output tensor (#batch, time, channels).
"""
# exchange the temporal dimension and the feature dimension
x
=
x
.
transpose
(
1
,
2
)
# (#batch, channels, time)
# mask batch padding
if
mask_pad
.
size
(
2
)
>
0
:
# time > 0
x
.
masked_fill_
(
~
mask_pad
,
0.0
)
if
self
.
lorder
>
0
:
if
cache
.
size
(
2
)
==
0
:
# cache_t == 0
x
=
nn
.
functional
.
pad
(
x
,
(
self
.
lorder
,
0
),
'constant'
,
0.0
)
else
:
assert
cache
.
size
(
0
)
==
x
.
size
(
0
)
# equal batch
assert
cache
.
size
(
1
)
==
x
.
size
(
1
)
# equal channel
x
=
torch
.
cat
((
cache
,
x
),
dim
=
2
)
assert
(
x
.
size
(
2
)
>
self
.
lorder
)
new_cache
=
x
[:,
:,
-
self
.
lorder
:]
else
:
# It's better we just return None if no cache is required,
# However, for JIT export, here we just fake one tensor instead of
# None.
new_cache
=
torch
.
zeros
((
0
,
0
,
0
),
dtype
=
x
.
dtype
,
device
=
x
.
device
)
# GLU mechanism
x
=
self
.
pointwise_conv1
(
x
)
# (batch, 2*channel, dim)
x
=
nn
.
functional
.
glu
(
x
,
dim
=
1
)
# (batch, channel, dim)
# 1D Depthwise Conv
x
=
self
.
depthwise_conv
(
x
)
if
self
.
use_layer_norm
:
x
=
x
.
transpose
(
1
,
2
)
x
=
self
.
activation
(
self
.
norm
(
x
))
if
self
.
use_layer_norm
:
x
=
x
.
transpose
(
1
,
2
)
x
=
self
.
pointwise_conv2
(
x
)
# mask batch padding
if
mask_pad
.
size
(
2
)
>
0
:
# time > 0
x
.
masked_fill_
(
~
mask_pad
,
0.0
)
return
x
.
transpose
(
1
,
2
),
new_cache
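A quick way to sanity-check the causal cache logic above: feeding the module chunk by chunk while carrying the returned left-context cache should reproduce a full-utterance forward pass. The following is a minimal sketch, not part of the commit, assuming this file is importable as cosyvoice.transformer.convolution:

import torch
from cosyvoice.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=8, kernel_size=15, causal=True).eval()
x = torch.randn(1, 20, 8)  # (batch, time, channels)

with torch.no_grad():
    full, _ = conv(x)  # one pass over the whole utterance
    # same input in two chunks, carrying the (batch, channels, lorder) cache
    cache = torch.zeros(0, 0, 0)
    y1, cache = conv(x[:, :10], cache=cache)
    y2, cache = conv(x[:, 10:], cache=cache)

print(torch.allclose(full, torch.cat([y1, y2], dim=1), atol=1e-6))  # True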
cosyvoice/transformer/decoder.py
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Decoder definition."""
from typing import Tuple, List, Optional

import torch
import torch.utils.checkpoint as ckpt
import logging

from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)


class TransformerDecoder(torch.nn.Module):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        src_attention: if false, encoder-decoder cross attention is not
            applied, such as CIF model
        key_bias: whether use bias in attention.linear_k, False for whisper models.
        gradient_checkpointing: rerunning a forward-pass segment for each
            checkpointed segment during backward.
        tie_word_embedding: Tie or clone module weights depending on whether we are
            using TorchScript or not
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        src_attention: bool = True,
        key_bias: bool = True,
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        attention_dim = encoder_output_size
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        self.embed = torch.nn.Sequential(
            torch.nn.Identity() if input_layer == "no_pos" else
            torch.nn.Embedding(vocab_size, attention_dim),
            COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
                                               positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
        self.use_output_layer = use_output_layer
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = torch.nn.Identity()
        self.num_blocks = num_blocks
        self.decoders = torch.nn.ModuleList([
            DecoderLayer(
                attention_dim,
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    self_attention_dropout_rate, key_bias),
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim, src_attention_dropout_rate,
                    key_bias) if src_attention else None,
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate, activation),
                dropout_rate,
                normalize_before,
            ) for _ in range(self.num_blocks)
        ])
        self.gradient_checkpointing = gradient_checkpointing
        self.tie_word_embedding = tie_word_embedding

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor = torch.empty(0),
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: not used in transformer decoder, in order to unify api
                with bidirectional decoder
            reverse_weight: not used in transformer decoder, in order to unify
                api with bidirectional decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                torch.tensor(0.0), in order to unify api with bidirectional decoder
                olens: (batch, )
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        tgt = ys_in_pad
        maxlen = tgt.size(1)
        # tgt_mask: (B, 1, L)
        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
        tgt_mask = tgt_mask.to(tgt.device)
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1),
                            device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m
        x, _ = self.embed(tgt)
        if self.gradient_checkpointing and self.training:
            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
                                                 memory_mask)
        else:
            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)
        olens = tgt_mask.sum(1)
        return x, torch.tensor(0.0), olens

    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
                       memory: torch.Tensor,
                       memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        return x

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                    tgt_mask: torch.Tensor,
                                    memory: torch.Tensor,
                                    memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
                layer.__call__, x, tgt_mask, memory, memory_mask)
        return x

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(x,
                                                       tgt_mask,
                                                       memory,
                                                       memory_mask,
                                                       cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = torch.log_softmax(self.output_layer(y), dim=-1)
        return y, new_cache

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending on whether we are using TorchScript or not"""
        if not self.use_output_layer:
            return
        if jit_mode:
            logging.info("clone emb.weight to output.weight")
            self.output_layer.weight = torch.nn.Parameter(
                self.embed[0].weight.clone())
        else:
            logging.info("tie emb.weight with output.weight")
            self.output_layer.weight = self.embed[0].weight

        if getattr(self.output_layer, "bias", None) is not None:
            self.output_layer.bias.data = torch.nn.functional.pad(
                self.output_layer.bias.data,
                (
                    0,
                    self.output_layer.weight.shape[0] -
                    self.output_layer.bias.shape[0],
                ),
                "constant",
                0,
            )


class BiTransformerDecoder(torch.nn.Module):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        r_num_blocks: the number of right to left decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        key_bias: whether use bias in attention.linear_k, False for whisper models.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        r_num_blocks: int = 0,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        self.tie_word_embedding = tie_word_embedding
        self.left_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

        self.right_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            r_num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor,
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
                used for right to left decoder
            reverse_weight: used for right to left decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                r_x: decoded token score (right to left decoder)
                    before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
                                          ys_in_lens)
        r_x = torch.tensor(0.0)
        if reverse_weight > 0.0:
            r_x, _, olens = self.right_decoder(memory, memory_mask,
                                               r_ys_in_pad, ys_in_lens)
        return l_x, r_x, olens

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
                                                  tgt_mask, cache)

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending on whether we are using TorchScript or not"""
        self.left_decoder.tie_or_clone_weights(jit_mode)
        self.right_decoder.tie_or_clone_weights(jit_mode)
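A shape-level sketch of how TransformerDecoder is driven, assuming the cosyvoice package from this commit is on PYTHONPATH; the dimensions are illustrative only and the sketch is not part of the commit:

import torch
from cosyvoice.transformer.decoder import TransformerDecoder

decoder = TransformerDecoder(vocab_size=100, encoder_output_size=64,
                             attention_heads=4, linear_units=256, num_blocks=2)
memory = torch.randn(2, 30, 64)                       # (B, T_in, feat)
memory_mask = torch.ones(2, 1, 30, dtype=torch.bool)  # no encoder padding
ys_in_pad = torch.randint(0, 100, (2, 12))            # (B, T_out)
ys_in_lens = torch.tensor([12, 9])                    # second sequence padded

logits, _, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
print(logits.shape)  # torch.Size([2, 12, 100]), token scores before softmax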
cosyvoice/transformer/decoder_layer.py
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple

import torch
from torch import nn


class DecoderLayer(nn.Module):
    """Single decoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, Inter-attention is not used, such as
            CIF, GPT, and other decoder-only models.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct a DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.
        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).
        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        x = residual + self.dropout(
            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0])
            if not self.normalize_before:
                x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
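The cache contract of DecoderLayer can be verified in isolation: when cache holds this layer's output for the first L-1 positions, only the last query position is recomputed and the returned tensor is the cache with one new frame appended. A small sketch of this, not part of the commit; it assumes cosyvoice/transformer/attention.py in this commit defines MultiHeadedAttention with a (heads, size, dropout) constructor, as in wenet:

import torch
from cosyvoice.transformer.attention import MultiHeadedAttention
from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

size = 32
layer = DecoderLayer(
    size,
    self_attn=MultiHeadedAttention(4, size, 0.0),
    src_attn=MultiHeadedAttention(4, size, 0.0),
    feed_forward=PositionwiseFeedForward(size, 64, 0.0),
    dropout_rate=0.0).eval()

tgt = torch.randn(1, 5, size)
tgt_mask = torch.tril(torch.ones(1, 5, 5, dtype=torch.bool))  # causal mask
memory = torch.randn(1, 7, size)
memory_mask = torch.ones(1, 1, 7, dtype=torch.bool)

full, *_ = layer(tgt, tgt_mask, memory, memory_mask)
# replay the last step only, seeding the cache with the first 4 output frames
step, *_ = layer(tgt, tgt_mask, memory, memory_mask, cache=full[:, :-1])
print(torch.allclose(full[:, -1], step[:, -1], atol=1e-5))  # True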
cosyvoice/transformer/embedding.py
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Positonal Encoding Module."""
import
math
from
typing
import
Tuple
,
Union
import
torch
import
torch.nn.functional
as
F
import
numpy
as
np
class
PositionalEncoding
(
torch
.
nn
.
Module
):
"""Positional encoding.
:param int d_model: embedding dim
:param float dropout_rate: dropout rate
:param int max_len: maximum input length
PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
"""
def
__init__
(
self
,
d_model
:
int
,
dropout_rate
:
float
,
max_len
:
int
=
5000
,
reverse
:
bool
=
False
):
"""Construct an PositionalEncoding object."""
super
().
__init__
()
self
.
d_model
=
d_model
self
.
xscale
=
math
.
sqrt
(
self
.
d_model
)
self
.
dropout
=
torch
.
nn
.
Dropout
(
p
=
dropout_rate
)
self
.
max_len
=
max_len
self
.
pe
=
torch
.
zeros
(
self
.
max_len
,
self
.
d_model
)
position
=
torch
.
arange
(
0
,
self
.
max_len
,
dtype
=
torch
.
float32
).
unsqueeze
(
1
)
div_term
=
torch
.
exp
(
torch
.
arange
(
0
,
self
.
d_model
,
2
,
dtype
=
torch
.
float32
)
*
-
(
math
.
log
(
10000.0
)
/
self
.
d_model
))
self
.
pe
[:,
0
::
2
]
=
torch
.
sin
(
position
*
div_term
)
self
.
pe
[:,
1
::
2
]
=
torch
.
cos
(
position
*
div_term
)
self
.
pe
=
self
.
pe
.
unsqueeze
(
0
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
offset
:
Union
[
int
,
torch
.
Tensor
]
=
0
)
\
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""Add positional encoding.
Args:
x (torch.Tensor): Input. Its shape is (batch, time, ...)
offset (int, torch.tensor): position offset
Returns:
torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
torch.Tensor: for compatibility to RelPositionalEncoding
"""
self
.
pe
=
self
.
pe
.
to
(
x
.
device
)
pos_emb
=
self
.
position_encoding
(
offset
,
x
.
size
(
1
),
False
)
x
=
x
*
self
.
xscale
+
pos_emb
return
self
.
dropout
(
x
),
self
.
dropout
(
pos_emb
)
def
position_encoding
(
self
,
offset
:
Union
[
int
,
torch
.
Tensor
],
size
:
int
,
apply_dropout
:
bool
=
True
)
->
torch
.
Tensor
:
""" For getting encoding in a streaming fashion
Attention!!!!!
we apply dropout only once at the whole utterance level in a none
streaming way, but will call this function several times with
increasing input size in a streaming scenario, so the dropout will
be applied several times.
Args:
offset (int or torch.tensor): start offset
size (int): required size of position encoding
Returns:
torch.Tensor: Corresponding encoding
"""
# How to subscript a Union type:
# https://github.com/pytorch/pytorch/issues/69434
if
isinstance
(
offset
,
int
):
assert
offset
+
size
<=
self
.
max_len
pos_emb
=
self
.
pe
[:,
offset
:
offset
+
size
]
elif
isinstance
(
offset
,
torch
.
Tensor
)
and
offset
.
dim
()
==
0
:
# scalar
assert
offset
+
size
<=
self
.
max_len
pos_emb
=
self
.
pe
[:,
offset
:
offset
+
size
]
else
:
# for batched streaming decoding on GPU
assert
torch
.
max
(
offset
)
+
size
<=
self
.
max_len
index
=
offset
.
unsqueeze
(
1
)
+
\
torch
.
arange
(
0
,
size
).
to
(
offset
.
device
)
# B X T
flag
=
index
>
0
# remove negative offset
index
=
index
*
flag
pos_emb
=
F
.
embedding
(
index
,
self
.
pe
[
0
])
# B X T X d_model
if
apply_dropout
:
pos_emb
=
self
.
dropout
(
pos_emb
)
return
pos_emb
class
RelPositionalEncoding
(
PositionalEncoding
):
"""Relative positional encoding module.
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
"""
def
__init__
(
self
,
d_model
:
int
,
dropout_rate
:
float
,
max_len
:
int
=
5000
):
"""Initialize class."""
super
().
__init__
(
d_model
,
dropout_rate
,
max_len
,
reverse
=
True
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
offset
:
Union
[
int
,
torch
.
Tensor
]
=
0
)
\
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""Compute positional encoding.
Args:
x (torch.Tensor): Input tensor (batch, time, `*`).
Returns:
torch.Tensor: Encoded tensor (batch, time, `*`).
torch.Tensor: Positional embedding tensor (1, time, `*`).
"""
self
.
pe
=
self
.
pe
.
to
(
x
.
device
)
x
=
x
*
self
.
xscale
pos_emb
=
self
.
position_encoding
(
offset
,
x
.
size
(
1
),
False
)
return
self
.
dropout
(
x
),
self
.
dropout
(
pos_emb
)
class
WhisperPositionalEncoding
(
PositionalEncoding
):
""" Sinusoids position encoding used in openai-whisper.encoder
"""
def
__init__
(
self
,
d_model
:
int
,
dropout_rate
:
float
,
max_len
:
int
=
1500
):
super
().
__init__
(
d_model
,
dropout_rate
,
max_len
)
self
.
xscale
=
1.0
log_timescale_increment
=
np
.
log
(
10000
)
/
(
d_model
//
2
-
1
)
inv_timescales
=
torch
.
exp
(
-
log_timescale_increment
*
torch
.
arange
(
d_model
//
2
))
scaled_time
=
torch
.
arange
(
max_len
)[:,
np
.
newaxis
]
*
\
inv_timescales
[
np
.
newaxis
,
:]
pe
=
torch
.
cat
([
torch
.
sin
(
scaled_time
),
torch
.
cos
(
scaled_time
)],
dim
=
1
)
delattr
(
self
,
"pe"
)
self
.
register_buffer
(
"pe"
,
pe
.
unsqueeze
(
0
))
class
LearnablePositionalEncoding
(
PositionalEncoding
):
""" Learnable position encoding used in openai-whisper.decoder
"""
def
__init__
(
self
,
d_model
:
int
,
dropout_rate
:
float
,
max_len
:
int
=
448
):
super
().
__init__
(
d_model
,
dropout_rate
,
max_len
)
# NOTE(xcsong): overwrite self.pe & self.xscale
self
.
pe
=
torch
.
nn
.
Parameter
(
torch
.
empty
(
1
,
max_len
,
d_model
))
self
.
xscale
=
1.0
class
NoPositionalEncoding
(
torch
.
nn
.
Module
):
""" No position encoding
"""
def
__init__
(
self
,
d_model
:
int
,
dropout_rate
:
float
):
super
().
__init__
()
self
.
d_model
=
d_model
self
.
dropout
=
torch
.
nn
.
Dropout
(
p
=
dropout_rate
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
offset
:
Union
[
int
,
torch
.
Tensor
]
=
0
)
\
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
""" Just return zero vector for interface compatibility
"""
pos_emb
=
torch
.
zeros
(
1
,
x
.
size
(
1
),
self
.
d_model
).
to
(
x
.
device
)
return
self
.
dropout
(
x
),
pos_emb
def
position_encoding
(
self
,
offset
:
Union
[
int
,
torch
.
Tensor
],
size
:
int
)
->
torch
.
Tensor
:
return
torch
.
zeros
(
1
,
size
,
self
.
d_model
)
class
EspnetRelPositionalEncoding
(
torch
.
nn
.
Module
):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
"""
def
__init__
(
self
,
d_model
,
dropout_rate
,
max_len
=
5000
):
"""Construct an PositionalEncoding object."""
super
(
EspnetRelPositionalEncoding
,
self
).
__init__
()
self
.
d_model
=
d_model
self
.
xscale
=
math
.
sqrt
(
self
.
d_model
)
self
.
dropout
=
torch
.
nn
.
Dropout
(
p
=
dropout_rate
)
self
.
pe
=
None
self
.
extend_pe
(
torch
.
tensor
(
0.0
).
expand
(
1
,
max_len
))
def
extend_pe
(
self
,
x
):
"""Reset the positional encodings."""
if
self
.
pe
is
not
None
:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if
self
.
pe
.
size
(
1
)
>=
x
.
size
(
1
)
*
2
-
1
:
if
self
.
pe
.
dtype
!=
x
.
dtype
or
self
.
pe
.
device
!=
x
.
device
:
self
.
pe
=
self
.
pe
.
to
(
dtype
=
x
.
dtype
,
device
=
x
.
device
)
return
# Suppose `i` means to the position of query vecotr and `j` means the
# position of key vector. We use position relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
pe_positive
=
torch
.
zeros
(
x
.
size
(
1
),
self
.
d_model
)
pe_negative
=
torch
.
zeros
(
x
.
size
(
1
),
self
.
d_model
)
position
=
torch
.
arange
(
0
,
x
.
size
(
1
),
dtype
=
torch
.
float32
).
unsqueeze
(
1
)
div_term
=
torch
.
exp
(
torch
.
arange
(
0
,
self
.
d_model
,
2
,
dtype
=
torch
.
float32
)
*
-
(
math
.
log
(
10000.0
)
/
self
.
d_model
)
)
pe_positive
[:,
0
::
2
]
=
torch
.
sin
(
position
*
div_term
)
pe_positive
[:,
1
::
2
]
=
torch
.
cos
(
position
*
div_term
)
pe_negative
[:,
0
::
2
]
=
torch
.
sin
(
-
1
*
position
*
div_term
)
pe_negative
[:,
1
::
2
]
=
torch
.
cos
(
-
1
*
position
*
div_term
)
# Reserve the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive
=
torch
.
flip
(
pe_positive
,
[
0
]).
unsqueeze
(
0
)
pe_negative
=
pe_negative
[
1
:].
unsqueeze
(
0
)
pe
=
torch
.
cat
([
pe_positive
,
pe_negative
],
dim
=
1
)
self
.
pe
=
pe
.
to
(
device
=
x
.
device
,
dtype
=
x
.
dtype
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
offset
:
Union
[
int
,
torch
.
Tensor
]
=
0
):
"""Add positional encoding.
Args:
x (torch.Tensor): Input tensor (batch, time, `*`).
Returns:
torch.Tensor: Encoded tensor (batch, time, `*`).
"""
self
.
extend_pe
(
x
)
x
=
x
*
self
.
xscale
pos_emb
=
self
.
position_encoding
(
size
=
x
.
size
(
1
),
offset
=
offset
)
return
self
.
dropout
(
x
),
self
.
dropout
(
pos_emb
)
def
position_encoding
(
self
,
offset
:
Union
[
int
,
torch
.
Tensor
],
size
:
int
)
->
torch
.
Tensor
:
""" For getting encoding in a streaming fashion
Attention!!!!!
we apply dropout only once at the whole utterance level in a none
streaming way, but will call this function several times with
increasing input size in a streaming scenario, so the dropout will
be applied several times.
Args:
offset (int or torch.tensor): start offset
size (int): required size of position encoding
Returns:
torch.Tensor: Corresponding encoding
"""
pos_emb
=
self
.
pe
[
:,
self
.
pe
.
size
(
1
)
//
2
-
size
+
1
:
self
.
pe
.
size
(
1
)
//
2
+
size
,
]
return
pos_emb
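The two encoding families above differ in what they return: PositionalEncoding adds the absolute embedding into x, while EspnetRelPositionalEncoding only scales x and hands back a (1, 2*T-1, d_model) relative embedding for the attention module to consume. A brief sketch, not part of the commit, with illustrative dimensions:

import torch
from cosyvoice.transformer.embedding import (PositionalEncoding,
                                             EspnetRelPositionalEncoding)

x = torch.randn(2, 10, 64)
abs_pe = PositionalEncoding(d_model=64, dropout_rate=0.0)
rel_pe = EspnetRelPositionalEncoding(d_model=64, dropout_rate=0.0)

y_abs, pos_abs = abs_pe(x)  # pos_emb already added into y_abs
y_rel, pos_rel = rel_pe(x)  # x only scaled by sqrt(d_model)
print(pos_abs.shape, pos_rel.shape)  # (1, 10, 64) and (1, 19, 64)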
cosyvoice/transformer/encoder.py
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder definition."""
from typing import Tuple

import torch
import torch.utils.checkpoint as ckpt

from cosyvoice.transformer.convolution import ConvolutionModule
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import make_pad_mask
from cosyvoice.utils.mask import add_optional_chunk_mask


class BaseEncoder(torch.nn.Module):

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        gradient_checkpointing: bool = False,
    ):
        """
        Args:
            input_size (int): input dim
            output_size (int): dimension of attention
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of encoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether use dynamic chunk size for
                training or not, You can only use fixed chunk(chunk_size > 0)
                or dynamic chunk size(use_dynamic_chunk = True)
            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
            key_bias: whether use bias in attention.linear_k, False for whisper models.
            gradient_checkpointing: rerunning a forward-pass segment for each
                checkpointed segment during backward.
        """
        super().__init__()
        self._output_size = output_size

        self.global_cmvn = global_cmvn
        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
            input_size,
            output_size,
            dropout_rate,
            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
                                                      positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk
        self.gradient_checkpointing = gradient_checkpointing

    def output_size(self) -> int:
        return self._output_size

    def forward(
        self,
        xs: torch.Tensor,
        xs_lens: torch.Tensor,
        decoding_chunk_size: int = 0,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed positions in tensor.
        Args:
            xs: padded input tensor (B, T, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and subsampled masks
            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
            masks: torch.Tensor batch padding mask after subsample
                (B, 1, T' ~= T/subsample_rate)
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        T = xs.size(1)
        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        xs, pos_emb, masks = self.embed(xs, masks)
        mask_pad = masks  # (B, 1, T/subsample_rate)
        chunk_masks = add_optional_chunk_mask(xs, masks,
                                              self.use_dynamic_chunk,
                                              self.use_dynamic_left_chunk,
                                              decoding_chunk_size,
                                              self.static_chunk_size,
                                              num_decoding_left_chunks)
        if self.gradient_checkpointing and self.training:
            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
                                                  mask_pad)
        else:
            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks

    def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
                       pos_emb: torch.Tensor,
                       mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        return xs

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, xs: torch.Tensor,
                                    chunk_masks: torch.Tensor,
                                    pos_emb: torch.Tensor,
                                    mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
                                                    chunk_masks, pos_emb,
                                                    mask_pad)
        return xs

    def forward_chunk(
        self,
        xs: torch.Tensor,
        offset: int,
        required_cache_size: int,
        att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """ Forward just one chunk
        Args:
            xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
                where `time == (chunk_size - 1) * subsample_rate +
                subsample.right_context + 1`
            offset (int): current offset in encoder output time stamp
            required_cache_size (int): cache size required for next chunk
                computation
                >=0: actual cache size
                <0: means all history cache is required
            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
                transformer/conformer attention, with shape
                (elayers, head, cache_t1, d_k * 2), where
                `head * d_k == hidden-dim` and
                `cache_t1 == chunk_size * num_decoding_left_chunks`.
            cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
                (elayers, b=1, hidden-dim, cache_t2), where
                `cache_t2 == cnn.lorder - 1`
        Returns:
            torch.Tensor: output of current input xs,
                with shape (b=1, chunk_size, hidden-dim).
            torch.Tensor: new attention cache required for next chunk, with
                dynamic shape (elayers, head, ?, d_k * 2)
                depending on required_cache_size.
            torch.Tensor: new conformer cnn cache required for next chunk, with
                same shape as the original cnn_cache.
        """
        assert xs.size(0) == 1
        # tmp_masks is just for interface compatibility
        tmp_masks = torch.ones(1,
                               xs.size(1),
                               device=xs.device,
                               dtype=torch.bool)
        tmp_masks = tmp_masks.unsqueeze(1)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
        # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
        elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
        chunk_size = xs.size(1)
        attention_key_size = cache_t1 + chunk_size
        pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
                                               size=attention_key_size)
        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
            next_cache_start = attention_key_size
        else:
            next_cache_start = max(attention_key_size - required_cache_size, 0)
        r_att_cache = []
        r_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            # NOTE(xcsong): Before layer.forward
            #   shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
            #   shape(cnn_cache[i])       is (b=1, hidden-dim, cache_t2)
            xs, _, new_att_cache, new_cnn_cache = layer(
                xs,
                att_mask,
                pos_emb,
                att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
                cnn_cache=cnn_cache[i]
                if cnn_cache.size(0) > 0 else cnn_cache)
            # NOTE(xcsong): After layer.forward
            #   shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
            #   shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
            r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
            r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
        if self.normalize_before:
            xs = self.after_norm(xs)

        # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
        #   ? may be larger than cache_t1, it depends on required_cache_size
        r_att_cache = torch.cat(r_att_cache, dim=0)
        # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
        r_cnn_cache = torch.cat(r_cnn_cache, dim=0)

        return (xs, r_att_cache, r_cnn_cache)

    def forward_chunk_by_chunk(
        self,
        xs: torch.Tensor,
        decoding_chunk_size: int,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Forward input chunk by chunk with chunk_size like a streaming
            fashion
        Here we should pay special attention to computation cache in the
        streaming style forward chunk by chunk. Three things should be taken
        into account for computation in the current network:
            1. transformer/conformer encoder layers output cache
            2. convolution in conformer
            3. convolution in subsampling
        However, we don't implement subsampling cache for:
            1. We can control subsampling module to output the right result by
               overlapping input instead of cache left context, even though it
               wastes some computation, but subsampling only takes a very
               small fraction of computation in the whole model.
            2. Typically, there are several convolution layers with subsampling
               in subsampling module, it is tricky and complicated to do cache
               with different convolution layers with different subsampling
               rate.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling, we need to rewrite it to make it work
               with cache, which is not preferred.
        Args:
            xs (torch.Tensor): (1, max_len, dim)
            chunk_size (int): decoding chunk size
        """
        assert decoding_chunk_size > 0
        # The model is trained by static or dynamic chunk
        assert self.static_chunk_size > 0 or self.use_dynamic_chunk
        subsampling = self.embed.subsampling_rate
        context = self.embed.right_context + 1  # Add current frame
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context
        num_frames = xs.size(1)
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        outputs = []
        offset = 0
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks

        # Feed forward overlap input step by step
        for cur in range(0, num_frames - context + 1, stride):
            end = min(cur + decoding_window, num_frames)
            chunk_xs = xs[:, cur:end, :]
            (y, att_cache,
             cnn_cache) = self.forward_chunk(chunk_xs, offset,
                                             required_cache_size, att_cache,
                                             cnn_cache)
            outputs.append(y)
            offset += y.size(1)
        ys = torch.cat(outputs, 1)
        masks = torch.ones((1, 1, ys.size(1)),
                           device=ys.device,
                           dtype=torch.bool)
        return ys, masks


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        key_bias: bool = True,
        selfattention_layer_type: str = "selfattn",
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
    ):
        """ Construct TransformerEncoder
        See Encoder for the meaning of each parameter.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
        self.encoders = torch.nn.ModuleList([
            TransformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
                    attention_heads, output_size, attention_dropout_rate,
                    key_bias),
                PositionwiseFeedForward(output_size, linear_units,
                                        dropout_rate, activation),
                dropout_rate, normalize_before) for _ in range(num_blocks)
        ])


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "rel_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        positionwise_conv_kernel_size: int = 1,
        macaron_style: bool = True,
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        cnn_module_kernel: int = 15,
        causal: bool = False,
        cnn_module_norm: str = "batch_norm",
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
    ):
        """Construct ConformerEncoder
        Args:
            input_size to use_dynamic_chunk, see in BaseEncoder
            positionwise_conv_kernel_size (int): Kernel size of positionwise
                conv1d layer.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now, it's just for configure
                compatibility.
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            key_bias: whether use bias in attention.linear_k, False for whisper models.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        # self-attention module definition
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
            key_bias,
        )
        # feed-forward module definition
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
        # convolution module definition
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        self.encoders = torch.nn.ModuleList([
            ConformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
                    *encoder_selfattn_layer_args),
                PositionwiseFeedForward(*positionwise_layer_args),
                PositionwiseFeedForward(
                    *positionwise_layer_args) if macaron_style else None,
                ConvolutionModule(
                    *convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
            ) for _ in range(num_blocks)
        ])
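For streaming use, a causal ConformerEncoder trained with a static chunk size can be decoded incrementally through forward_chunk_by_chunk, which threads the attention and cnn caches across chunks. A runnable sketch under stated assumptions, not part of the commit: it assumes 'linear' (LinearNoSubsampling) is registered in COSYVOICE_SUBSAMPLE_CLASSES as in wenet, and the 80-dim input is illustrative:

import torch
from cosyvoice.transformer.encoder import ConformerEncoder

enc = ConformerEncoder(input_size=80, output_size=64, attention_heads=4,
                       linear_units=256, num_blocks=2, input_layer='linear',
                       static_chunk_size=4, use_cnn_module=True,
                       causal=True).eval()
xs = torch.randn(1, 64, 80)
with torch.no_grad():
    full, _ = enc(xs, torch.tensor([64]))  # whole utterance at once
    stream, _ = enc.forward_chunk_by_chunk(xs, decoding_chunk_size=4)
print(full.shape, stream.shape)  # both (1, 64, 64): no subsampling here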
cosyvoice/transformer/encoder_layer.py
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder self-attention layer definition."""
from typing import Optional, Tuple

import torch
from torch import nn


class TransformerEncoderLayer(nn.Module):
    """Encoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
            instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: torch.nn.Module,
        feed_forward: torch.nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an EncoderLayer object."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.
        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
                (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): just for interface compatibility
                to ConformerEncoderLayer
            mask_pad (torch.Tensor): not used in transformer layer,
                just for unified api with conformer.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2), not used here, it's for interface
                compatibility to ConformerEncoderLayer.
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time, time).
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch=1, size, cache_t2).
        """
        residual = x
        if self.normalize_before:
            x = self.norm1(x)
        x_att, new_att_cache = self.self_attn(x,
                                              x,
                                              x,
                                              mask,
                                              pos_emb=pos_emb,
                                              cache=att_cache)
        x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm2(x)

        fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        return x, mask, new_att_cache, fake_cnn_cache


class ConformerEncoderLayer(nn.Module):
    """Encoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
            instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        feed_forward_macaron (torch.nn.Module): Additional feed-forward module
            instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        conv_module (torch.nn.Module): Convolution module instance.
            `ConvolutionModule` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: torch.nn.Module,
        feed_forward: Optional[nn.Module] = None,
        feed_forward_macaron: Optional[nn.Module] = None,
        conv_module: Optional[nn.Module] = None,
        dropout_rate: float = 0.1,
        normalize_before: bool = True,
    ):
        """Construct an EncoderLayer object."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module
        self.norm_ff = nn.LayerNorm(size, eps=1e-5)  # for the FNN module
        self.norm_mha = nn.LayerNorm(size, eps=1e-5)  # for the MHA module
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0
        if self.conv_module is not None:
            self.norm_conv = nn.LayerNorm(size, eps=1e-5)  # for the CNN module
            self.norm_final = nn.LayerNorm(
                size, eps=1e-5)  # for the final output of the block
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.
        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
                (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): positional encoding, must not be None
                for ConformerEncoderLayer.
            mask_pad (torch.Tensor): batch padding mask used for conv module.
                (#batch, 1, time), (0, 0, 0) means fake mask.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2)
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time, time).
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
        """
        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)
        x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
                                              att_cache)
        x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        # Fake new cnn cache here, and then change it in conv_module
        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
            x = residual + self.dropout(x)
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        return x, mask, new_att_cache, new_cnn_cache
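A single-layer smoke test, not part of the commit, showing the cache shapes that BaseEncoder.forward_chunk relies on; it assumes MultiHeadedAttention from cosyvoice/transformer/attention.py with a (heads, size, dropout) constructor, as in wenet:

import torch
from cosyvoice.transformer.attention import MultiHeadedAttention
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

size = 32
layer = TransformerEncoderLayer(size,
                                MultiHeadedAttention(4, size, 0.0),
                                PositionwiseFeedForward(size, 64, 0.0),
                                dropout_rate=0.0).eval()
x = torch.randn(1, 6, size)
mask = torch.ones(1, 6, 6, dtype=torch.bool)
pos_emb = torch.empty(0)  # ignored by plain self-attention
y, _, att_cache, _ = layer(x, mask, pos_emb)
print(y.shape, att_cache.shape)  # (1, 6, 32) and (1, 4, 6, 16): cached K,V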
cosyvoice/transformer/label_smoothing_loss.py
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Label smoothing module."""
import torch
from torch import nn


class LabelSmoothingLoss(nn.Module):
    """Label-smoothing loss.
    In a standard CE loss, the label's data distribution is:
    [0,1,2] ->
    [
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
    In the smoothing version of the CE loss, some probability mass
    is taken from the true label prob (1.0) and is divided
    among the other labels.
    e.g.
    smoothing=0.1
    [0,1,2] ->
    [
        [0.9, 0.05, 0.05],
        [0.05, 0.9, 0.05],
        [0.05, 0.05, 0.9],
    ]
    Args:
        size (int): the number of classes
        padding_idx (int): padding class id which will be ignored for loss
        smoothing (float): smoothing rate (0.0 means the conventional CE)
        normalize_length (bool):
            normalize loss by sequence length if True
            normalize loss by batch size if False
    """

    def __init__(self,
                 size: int,
                 padding_idx: int,
                 smoothing: float,
                 normalize_length: bool = False):
        """Construct a LabelSmoothingLoss object."""
        super(LabelSmoothingLoss, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="none")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.normalize_length = normalize_length

    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Compute loss between x and target.
        The model outputs and data labels tensors are flattened to
        (batch*seqlen, class) shape and a mask is applied to the
        padding part which should not be calculated for loss.
        Args:
            x (torch.Tensor): prediction (batch, seqlen, class)
            target (torch.Tensor):
                target signal masked with self.padding_id (batch, seqlen)
        Returns:
            loss (torch.Tensor) : The KL loss, scalar float value
        """
        assert x.size(2) == self.size
        batch_size = x.size(0)
        x = x.view(-1, self.size)
        target = target.view(-1)
        # use zeros_like instead of torch.no_grad() for true_dist,
        # since no_grad() can not be exported by JIT
        true_dist = torch.zeros_like(x)
        true_dist.fill_(self.smoothing / (self.size - 1))
        ignore = target == self.padding_idx  # (B,)
        total = len(target) - ignore.sum().item()
        target = target.masked_fill(ignore, 0)  # avoid -1 index
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
        denom = total if self.normalize_length else batch_size
        return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
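A worked toy example of the smoothing above, not part of the commit: with size=3 and smoothing=0.1, each row of the reference distribution puts 0.9 on the true class and 0.05 elsewhere, and positions equal to padding_idx are masked out of the sum:

import torch
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=3, padding_idx=-1, smoothing=0.1,
                               normalize_length=True)
logits = torch.randn(2, 4, 3)          # (batch, seqlen, class)
target = torch.tensor([[0, 1, 2, -1],  # last position is padding
                       [2, 2, 0, 1]])
loss = criterion(logits, target)
print(loss)  # scalar KL divergence averaged over the 7 non-padding tokens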
cosyvoice/transformer/positionwise_feed_forward.py
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Positionwise feed forward layer definition."""
import torch


class PositionwiseFeedForward(torch.nn.Module):
    """Positionwise feed forward layer.

    The feed-forward transform is applied independently at each
    position of the sequence. The output dim is the same as the
    input dim.

    Args:
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation function
    """

    def __init__(self,
                 idim: int,
                 hidden_units: int,
                 dropout_rate: float,
                 activation: torch.nn.Module = torch.nn.ReLU()):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.activation = activation
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.w_2 = torch.nn.Linear(hidden_units, idim)

    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """Forward function.

        Args:
            xs: input tensor (B, L, D)
        Returns:
            output tensor, (B, L, D)
        """
        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
class MoEFFNLayer(torch.nn.Module):
    """Mixture of experts with positionwise feed forward layers.

    See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
    The output dim is the same as the input dim.

    Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
    https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219

    Args:
        n_expert: number of experts.
        n_expert_per_token: the number of experts actually used for each frame
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation function
    """

    def __init__(self,
                 n_expert: int,
                 n_expert_per_token: int,
                 idim: int,
                 hidden_units: int,
                 dropout_rate: float,
                 activation: torch.nn.Module = torch.nn.ReLU()):
        super(MoEFFNLayer, self).__init__()
        self.gate = torch.nn.Linear(idim, n_expert, bias=False)
        self.experts = torch.nn.ModuleList(
            PositionwiseFeedForward(idim, hidden_units, dropout_rate,
                                    activation) for _ in range(n_expert))
        self.n_expert_per_token = n_expert_per_token

    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """Forward function.

        Args:
            xs: input tensor (B, L, D)
        Returns:
            output tensor, (B, L, D)
        """
        B, L, D = xs.size()  # batch size, sequence length, embedding dimension (idim)
        xs = xs.view(-1, D)  # (B*L, D)
        router = self.gate(xs)  # (B*L, n_expert)
        logits, indices = torch.topk(router, self.n_expert_per_token)
        # logits, indices: (B*L, n_expert_per_token)
        weights = torch.nn.functional.softmax(
            logits, dim=1, dtype=torch.float).to(dtype=xs.dtype)
        # (B*L, n_expert_per_token)
        output = torch.zeros_like(xs)  # (B*L, D)
        for i, expert in enumerate(self.experts):
            mask = indices == i
            batch_idx, ith_expert = torch.where(mask)
            output[batch_idx] += weights[batch_idx, ith_expert,
                                         None] * expert(xs[batch_idx])
        return output.view(B, L, D)
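# Routing sketch (illustrative only, not part of the committed file): each
# of the B*L frames is dispatched to its top-k experts, and the expert
# outputs are blended with the softmax-renormalized router weights. Sizes
# below are made up; the layer preserves the (B, L, D) shape.
import torch

moe = MoEFFNLayer(n_expert=4, n_expert_per_token=2,
                  idim=256, hidden_units=1024, dropout_rate=0.1)
xs = torch.randn(2, 50, 256)   # (B, L, D)
assert moe(xs).shape == xs.shape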
cosyvoice/transformer/subsampling.py
0 → 100644
View file @
8d03db9a
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Subsampling layer definition."""
from typing import Tuple, Union

import torch


class BaseSubsampling(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.right_context = 0
        self.subsampling_rate = 1

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return self.pos_enc.position_encoding(offset, size)


class EmbedinigNoSubsampling(BaseSubsampling):
    """Embedding input without subsampling."""

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        super().__init__()
        self.embed = torch.nn.Embedding(idim, odim)
        self.pos_enc = pos_enc_class

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: embedded tensor (#batch, time', odim),
                where time' = time.
            torch.Tensor: positional encoding
            torch.Tensor: input mask (#batch, 1, time'),
                where time' = time.
        """
        x = self.embed(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask


class LinearNoSubsampling(BaseSubsampling):
    """Linear transform of the input without subsampling.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time.
            torch.Tensor: positional encoding
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time.
        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask


class Conv1dSubsampling2(BaseSubsampling):
    """Convolutional 1D subsampling (to 1/2 length).

    It is designed for Whisper, ref:
    https://github.com/openai/whisper/blob/main/whisper/model.py

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv1dSubsampling2 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
            torch.nn.GELU(),
            torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
            torch.nn.GELU(),
        )
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 2
        # 4 = (3 - 1) * 1 + (3 - 1) * 1
        self.right_context = 4

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 2.
            torch.Tensor: positional encoding
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 2.
        """
        time = x.size(1)
        x = x.transpose(1, 2)  # (b, f, t)
        x = self.conv(x)
        x = x.transpose(1, 2)  # (b, t, f)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]
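# Note on the mask slice above (illustrative check, not committed code):
# with padding=1 and stride=2 the conv output has (time + 1) // 2 frames
# (i.e. time // 2 when time is even), and starting the mask slice at offset
# (time + 1) % 2 yields exactly that many entries for even and odd time.
import torch

conv = torch.nn.Sequential(
    torch.nn.Conv1d(8, 8, kernel_size=3, padding=1),
    torch.nn.Conv1d(8, 8, kernel_size=3, stride=2, padding=1),
)
for time in (9, 10):
    out_len = conv(torch.randn(1, 8, time)).size(2)
    mask = torch.ones(1, 1, time, dtype=torch.bool)
    assert out_len == mask[:, :, (time + 1) % 2::2].size(2) == (time + 1) // 2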
class Conv2dSubsampling4(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/4 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling4 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.out = torch.nn.Sequential(
            torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 4
        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: positional encoding
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.
        """
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
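# Worked example for the Linear in-feature count above (illustrative, not
# committed code): each unpadded kernel-3 / stride-2 conv maps F frequency
# bins to (F - 1) // 2, so a typical 80-dim mel input gives 80 -> 39 -> 19
# bins and the Linear sees odim * 19 input features.
idim = 80                      # hypothetical mel dimension
f1 = (idim - 1) // 2           # 39 bins after the first conv
f2 = (f1 - 1) // 2             # 19 bins after the second conv
assert f2 == ((idim - 1) // 2 - 1) // 2 == 19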
class Conv2dSubsampling6(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/6 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling6 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 5, 3),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(
            odim * (((idim - 1) // 2 - 2) // 3), odim)
        self.pos_enc = pos_enc_class
        # 10 = (3 - 1) * 1 + (5 - 1) * 2
        self.subsampling_rate = 6
        self.right_context = 10

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 6.
            torch.Tensor: positional encoding
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 6.
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]


class Conv2dSubsampling8(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/8 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling8 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(
            odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
        self.pos_enc = pos_enc_class
        self.subsampling_rate = 8
        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
        self.right_context = 14

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 8.
            torch.Tensor: positional encoding
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 8.
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]


class LegacyLinearNoSubsampling(BaseSubsampling):
    """Linear transform of the input without subsampling.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time.
            torch.Tensor: positional encoding
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time.
        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask
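# Usage sketch (illustrative only, not part of the committed file): 4x
# temporal subsampling of an 80-dim mel input. PositionalEncoding(odim,
# dropout_rate) is assumed, as elsewhere in this commit, to return
# (x, pos_emb) when called.
import torch
from cosyvoice.transformer.embedding import PositionalEncoding
from cosyvoice.transformer.subsampling import Conv2dSubsampling4

sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1,
                         pos_enc_class=PositionalEncoding(256, 0.1))
x = torch.randn(1, 100, 80)                      # (#batch, time, idim)
x_mask = torch.ones(1, 1, 100, dtype=torch.bool)
y, pos_emb, y_mask = sub(x, x_mask)
assert y.shape == (1, 24, 256) and y_mask.shape == (1, 1, 24)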
cosyvoice/utils/__init__.py
0 → 100644
View file @
8d03db9a
cosyvoice/utils/class_utils.py
0 → 100644
View file @
8d03db9a
# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from cosyvoice.transformer.activation import Swish
from cosyvoice.transformer.subsampling import (
    LinearNoSubsampling,
    EmbedinigNoSubsampling,
    Conv1dSubsampling2,
    Conv2dSubsampling4,
    Conv2dSubsampling6,
    Conv2dSubsampling8,
)
from cosyvoice.transformer.embedding import (
    PositionalEncoding,
    RelPositionalEncoding,
    WhisperPositionalEncoding,
    LearnablePositionalEncoding,
    NoPositionalEncoding,
)
from cosyvoice.transformer.attention import (
    MultiHeadedAttention,
    RelPositionMultiHeadedAttention,
)
from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling


COSYVOICE_ACTIVATION_CLASSES = {
    "hardtanh": torch.nn.Hardtanh,
    "tanh": torch.nn.Tanh,
    "relu": torch.nn.ReLU,
    "selu": torch.nn.SELU,
    "swish": getattr(torch.nn, "SiLU", Swish),
    "gelu": torch.nn.GELU,
}

COSYVOICE_SUBSAMPLE_CLASSES = {
    "linear": LinearNoSubsampling,
    "linear_legacy": LegacyLinearNoSubsampling,
    "embed": EmbedinigNoSubsampling,
    "conv1d2": Conv1dSubsampling2,
    "conv2d": Conv2dSubsampling4,
    "conv2d6": Conv2dSubsampling6,
    "conv2d8": Conv2dSubsampling8,
    "paraformer_dummy": torch.nn.Identity,
}

COSYVOICE_EMB_CLASSES = {
    "embed": PositionalEncoding,
    "abs_pos": PositionalEncoding,
    "rel_pos": RelPositionalEncoding,
    "rel_pos_espnet": EspnetRelPositionalEncoding,
    "no_pos": NoPositionalEncoding,
    "abs_pos_whisper": WhisperPositionalEncoding,
    "embed_learnable_pe": LearnablePositionalEncoding,
}

COSYVOICE_ATTENTION_CLASSES = {
    "selfattn": MultiHeadedAttention,
    "rel_selfattn": RelPositionMultiHeadedAttention,
}
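# Lookup sketch (illustrative only, not part of the committed file): these
# registries let configs pick components by string name. The sizes below
# are made up, and the (d_model, dropout_rate) positional-encoding
# signature is an assumption based on its use elsewhere in this commit.
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
)

pos_enc = COSYVOICE_EMB_CLASSES["abs_pos"](256, 0.1)      # PositionalEncoding
subsample = COSYVOICE_SUBSAMPLE_CLASSES["conv2d"](
    80, 256, 0.1, pos_enc)                                # Conv2dSubsampling4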