ModelZoo / VITA-Audio_pytorch / Commits

Commit 39ac40a9, authored Jun 06, 2025 by chenzk
v1.0

Pipeline #2747 failed with stages in 0 seconds.
Showing 20 of the 427+ changed files, with 2737 additions and 0 deletions (+2737, -0).
third_party/GLM-4-Voice/cosyvoice/transformer/decoder.py  (+396, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/decoder_layer.py  (+132, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/embedding.py  (+293, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/encoder.py  (+567, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/encoder_layer.py  (+236, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/label_smoothing_loss.py  (+96, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/positionwise_feed_forward.py  (+115, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/subsampling.py  (+383, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__init__.py  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/common.cpython-310.pyc  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/mask.cpython-310.pyc  (+0, -0)
third_party/GLM-4-Voice/cosyvoice/utils/block_mask_util.py  (+34, -0)
third_party/GLM-4-Voice/cosyvoice/utils/class_utils.py  (+72, -0)
third_party/GLM-4-Voice/cosyvoice/utils/common.py  (+103, -0)
third_party/GLM-4-Voice/cosyvoice/utils/executor.py  (+132, -0)
third_party/GLM-4-Voice/cosyvoice/utils/file_utils.py  (+53, -0)
third_party/GLM-4-Voice/cosyvoice/utils/frontend_utils.py  (+125, -0)
third_party/GLM-4-Voice/cosyvoice/transformer/decoder.py (new file, mode 100644)
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Decoder definition."""
from typing import Tuple, List, Optional

import torch
import torch.utils.checkpoint as ckpt
import logging

from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)


class TransformerDecoder(torch.nn.Module):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        src_attention: if false, encoder-decoder cross attention is not
            applied, such as CIF model
        key_bias: whether use bias in attention.linear_k, False for whisper models.
        gradient_checkpointing: rerunning a forward-pass segment for each
            checkpointed segment during backward.
        tie_word_embedding: Tie or clone module weights depending of whether we are
            using TorchScript or not
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        src_attention: bool = True,
        key_bias: bool = True,
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        attention_dim = encoder_output_size
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        self.embed = torch.nn.Sequential(
            torch.nn.Identity() if input_layer == "no_pos" else
            torch.nn.Embedding(vocab_size, attention_dim),
            COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
                                               positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
        self.use_output_layer = use_output_layer
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = torch.nn.Identity()
        self.num_blocks = num_blocks
        self.decoders = torch.nn.ModuleList([
            DecoderLayer(
                attention_dim,
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    self_attention_dropout_rate, key_bias),
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    src_attention_dropout_rate, key_bias)
                if src_attention else None,
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate, activation),
                dropout_rate,
                normalize_before,
            ) for _ in range(self.num_blocks)
        ])

        self.gradient_checkpointing = gradient_checkpointing
        self.tie_word_embedding = tie_word_embedding

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor = torch.empty(0),
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: not used in transformer decoder, in order to unify api
                with bidirectional decoder
            reverse_weight: not used in transformer decoder, in order to unify
                api with bidirectional decode
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                torch.tensor(0.0), in order to unify api with bidirectional decoder
                olens: (batch, )
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        tgt = ys_in_pad
        maxlen = tgt.size(1)
        # tgt_mask: (B, 1, L)
        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
        tgt_mask = tgt_mask.to(tgt.device)
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1),
                            device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m
        x, _ = self.embed(tgt)
        if self.gradient_checkpointing and self.training:
            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
                                                 memory_mask)
        else:
            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)
        olens = tgt_mask.sum(1)
        return x, torch.tensor(0.0), olens

    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
                       memory: torch.Tensor,
                       memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        return x

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                    tgt_mask: torch.Tensor,
                                    memory: torch.Tensor,
                                    memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
                layer.__call__, x, tgt_mask, memory, memory_mask)
        return x

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(x,
                                                       tgt_mask,
                                                       memory,
                                                       memory_mask,
                                                       cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = torch.log_softmax(self.output_layer(y), dim=-1)
        return y, new_cache

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
            depending of whether we are using TorchScript or not"""
        if not self.use_output_layer:
            return
        if jit_mode:
            logging.info("clone emb.weight to output.weight")
            self.output_layer.weight = torch.nn.Parameter(
                self.embed[0].weight.clone())
        else:
            logging.info("tie emb.weight with output.weight")
            self.output_layer.weight = self.embed[0].weight

        if getattr(self.output_layer, "bias", None) is not None:
            self.output_layer.bias.data = torch.nn.functional.pad(
                self.output_layer.bias.data,
                (
                    0,
                    self.output_layer.weight.shape[0] -
                    self.output_layer.bias.shape[0],
                ),
                "constant",
                0,
            )


class BiTransformerDecoder(torch.nn.Module):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        r_num_blocks: the number of right to left decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        key_bias: whether use bias in attention.linear_k, False for whisper models.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        r_num_blocks: int = 0,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        self.tie_word_embedding = tie_word_embedding
        self.left_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

        self.right_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            r_num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor,
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
                used for right to left decoder
            reverse_weight: used for right to left decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                r_x: x: decoded token score (right to left decoder)
                    before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
                                          ys_in_lens)
        r_x = torch.tensor(0.0)
        if reverse_weight > 0.0:
            r_x, _, olens = self.right_decoder(memory, memory_mask,
                                               r_ys_in_pad, ys_in_lens)
        return l_x, r_x, olens

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                y.shape` is (batch, maxlen_out, token)
        """
        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
                                                  tgt_mask, cache)

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
            depending of whether we are using TorchScript or not"""
        self.left_decoder.tie_or_clone_weights(jit_mode)
        self.right_decoder.tie_or_clone_weights(jit_mode)
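A minimal smoke-test sketch for the TransformerDecoder above, not part of this commit. It assumes the rest of the cosyvoice package added in this commit (the class_utils registries, the attention and feed-forward modules they reference) is importable; the vocabulary size, hidden size, and tensor shapes are made-up illustration values.

# Illustrative usage sketch (editorial example, not committed code).
import torch
from cosyvoice.transformer.decoder import TransformerDecoder

decoder = TransformerDecoder(vocab_size=1000, encoder_output_size=256)
decoder.eval()

batch, t_in, t_out = 2, 50, 10
memory = torch.randn(batch, t_in, 256)                       # fake encoder outputs
memory_mask = torch.ones(batch, 1, t_in, dtype=torch.bool)   # no padding
ys_in_pad = torch.randint(0, 1000, (batch, t_out))           # target token ids
ys_in_lens = torch.tensor([t_out, t_out])

with torch.no_grad():
    logits, _, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
print(logits.shape)  # expected: (2, 10, 1000), token scores before softmax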
third_party/GLM-4-Voice/cosyvoice/transformer/decoder_layer.py (new file, mode 100644)
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple

import torch
from torch import nn


class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, Inter-attention is not used, such as
            CIF, GPT, and other decoder only model.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: to use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        x = residual + self.dropout(
            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0])
            if not self.normalize_before:
                x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
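The cache argument in DecoderLayer.forward lets incremental decoding compute only the newest position and then re-append the cached outputs. A minimal sketch of that path follows, not part of this commit; StubAttention is a hypothetical stand-in that only mimics the (output, ...)[0] calling convention of the real MultiHeadedAttention used above.

# Illustrative sketch of the cached (incremental) decoding path.
import torch
from torch import nn
from cosyvoice.transformer.decoder_layer import DecoderLayer

class StubAttention(nn.Module):
    # Hypothetical stand-in: returns the values for the query positions,
    # in the same (output, extra) tuple form indexed with [0] above.
    def forward(self, q, k, v, mask):
        return v[:, -q.size(1):], None

layer = DecoderLayer(size=8,
                     self_attn=StubAttention(),
                     src_attn=None,          # decoder-only style, no cross-attention
                     feed_forward=nn.Linear(8, 8),
                     dropout_rate=0.0)
layer.eval()

tgt = torch.randn(1, 5, 8)                        # 5 decoded positions so far
tgt_mask = torch.ones(1, 5, 5, dtype=torch.bool)
cache = torch.randn(1, 4, 8)                      # outputs of the first 4 positions
memory = torch.zeros(1, 0, 8)                     # unused since src_attn is None
memory_mask = torch.ones(1, 1, 0, dtype=torch.bool)

x, *_ = layer(tgt, tgt_mask, memory, memory_mask, cache=cache)
print(x.shape)  # (1, 5, 8): the 4 cached positions plus the newly computed last one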
third_party/GLM-4-Voice/cosyvoice/transformer/embedding.py (new file, mode 100644)
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Positonal Encoding Module."""
import math
from typing import Tuple, Union

import torch
import torch.nn.functional as F
import numpy as np


class PositionalEncoding(torch.nn.Module):
    """Positional encoding.

    :param int d_model: embedding dim
    :param float dropout_rate: dropout rate
    :param int max_len: maximum input length

    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
    """

    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int = 5000,
                 reverse: bool = False):
        """Construct an PositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.max_len = max_len

        self.pe = torch.zeros(self.max_len, self.d_model)
        position = torch.arange(0, self.max_len,
                                dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
            -(math.log(10000.0) / self.d_model))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, ...)
            offset (int, torch.tensor): position offset

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            torch.Tensor: for compatibility to RelPositionalEncoding
        """
        self.pe = self.pe.to(x.device)
        pos_emb = self.position_encoding(offset, x.size(1), False)
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int,
                          apply_dropout: bool = True) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        # How to subscript a Union type:
        #   https://github.com/pytorch/pytorch/issues/69434
        if isinstance(offset, int):
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        else:  # for batched streaming decoding on GPU
            assert torch.max(offset) + size <= self.max_len
            index = offset.unsqueeze(1) + \
                torch.arange(0, size).to(offset.device)  # B X T
            flag = index > 0
            # remove negative offset
            index = index * flag
            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model

        if apply_dropout:
            pos_emb = self.dropout(pos_emb)
        return pos_emb


class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module.
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
        """Initialize class."""
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute positional encoding.
        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).
        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).
        """
        self.pe = self.pe.to(x.device)
        x = x * self.xscale
        pos_emb = self.position_encoding(offset, x.size(1), False)
        return self.dropout(x), self.dropout(pos_emb)


class WhisperPositionalEncoding(PositionalEncoding):
    """ Sinusoids position encoding used in openai-whisper.encoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
        super().__init__(d_model, dropout_rate, max_len)
        self.xscale = 1.0
        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
        inv_timescales = torch.exp(-log_timescale_increment *
                                   torch.arange(d_model // 2))
        scaled_time = torch.arange(max_len)[:, np.newaxis] * \
            inv_timescales[np.newaxis, :]
        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
        delattr(self, "pe")
        self.register_buffer("pe", pe.unsqueeze(0))


class LearnablePositionalEncoding(PositionalEncoding):
    """ Learnable position encoding used in openai-whisper.decoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
        super().__init__(d_model, dropout_rate, max_len)
        # NOTE(xcsong): overwrite self.pe & self.xscale
        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
        self.xscale = 1.0


class NoPositionalEncoding(torch.nn.Module):
    """ No position encoding
    """

    def __init__(self, d_model: int, dropout_rate: float):
        super().__init__()
        self.d_model = d_model
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """ Just return zero vector for interface compatibility
        """
        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
        return self.dropout(x), pos_emb

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return torch.zeros(1, size, self.d_model)


class EspnetRelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.
    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Construct an PositionalEncoding object."""
        super(EspnetRelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Reset the positional encodings."""
        if self.pe is not None:
            # self.pe contains both positive and negative parts
            # the length of self.pe is 2 * input_len - 1
            if self.pe.size(1) >= x.size(1) * 2 - 1:
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        # Suppose `i` means to the position of query vecotr and `j` means the
        # position of key vector. We use position relative positions when keys
        # are to the left (i>j) and negative relative positions otherwise (i<j).
        pe_positive = torch.zeros(x.size(1), self.d_model)
        pe_negative = torch.zeros(x.size(1), self.d_model)
        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
            -(math.log(10000.0) / self.d_model))
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)

        # Reserve the order of positive indices and concat both positive and
        # negative indices. This is used to support the shifting trick
        # as in https://arxiv.org/abs/1901.02860
        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        pe_negative = pe_negative[1:].unsqueeze(0)
        pe = torch.cat([pe_positive, pe_negative], dim=1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
        """
        self.extend_pe(x)
        x = x * self.xscale
        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        pos_emb = self.pe[
            :,
            self.pe.size(1) // 2 - size + 1:self.pe.size(1) // 2 + size,
        ]
        return pos_emb
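The table built in PositionalEncoding.__init__ realizes the docstring formula PE(pos, 2i) = sin(pos / 10000^(2i/d_model)). A short hedged check, not part of this commit, that recomputes one entry directly and exercises forward(); the dimensions and indices are arbitrary illustration values.

# Illustrative check of the sinusoidal table against its closed form.
import math
import torch
from cosyvoice.transformer.embedding import PositionalEncoding

enc = PositionalEncoding(d_model=64, dropout_rate=0.0, max_len=100)

pos, col = 7, 10                                    # arbitrary position and even column index
expected = math.sin(pos / (10000 ** (col / 64)))    # PE(pos, 2i) with 2i == col
actual = enc.pe[0, pos, col].item()
print(abs(expected - actual) < 1e-5)                # True

# forward() scales the input by sqrt(d_model) and adds the table slice
x = torch.zeros(1, 20, 64)
y, pos_emb = enc(x)
print(y.shape, pos_emb.shape)                       # (1, 20, 64) (1, 20, 64)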
third_party/GLM-4-Voice/cosyvoice/transformer/encoder.py (new file, mode 100644)
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder definition."""
from typing import Tuple

import torch
import torch.utils.checkpoint as ckpt

from cosyvoice.transformer.convolution import ConvolutionModule
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import make_pad_mask
from cosyvoice.utils.mask import add_optional_chunk_mask


class BaseEncoder(torch.nn.Module):

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        gradient_checkpointing: bool = False,
    ):
        """
        Args:
            input_size (int): input dim
            output_size (int): dimension of attention
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of decoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether use dynamic chunk size for
                training or not, You can only use fixed chunk(chunk_size > 0)
                or dyanmic chunk size(use_dynamic_chunk = True)
            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
            key_bias: whether use bias in attention.linear_k, False for whisper models.
            gradient_checkpointing: rerunning a forward-pass segment for each
                checkpointed segment during backward.
        """
        super().__init__()
        self._output_size = output_size

        self.global_cmvn = global_cmvn
        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
            input_size,
            output_size,
            dropout_rate,
            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
                                                      positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk
        self.gradient_checkpointing = gradient_checkpointing

    def output_size(self) -> int:
        return self._output_size

    def forward(
        self,
        xs: torch.Tensor,
        xs_lens: torch.Tensor,
        decoding_chunk_size: int = 0,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed positions in tensor.

        Args:
            xs: padded input tensor (B, T, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and subsampled masks
            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
            masks: torch.Tensor batch padding mask after subsample
                (B, 1, T' ~= T/subsample_rate)
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        T = xs.size(1)
        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        xs, pos_emb, masks = self.embed(xs, masks)
        mask_pad = masks  # (B, 1, T/subsample_rate)
        chunk_masks = add_optional_chunk_mask(xs, masks,
                                              self.use_dynamic_chunk,
                                              self.use_dynamic_left_chunk,
                                              decoding_chunk_size,
                                              self.static_chunk_size,
                                              num_decoding_left_chunks)
        if self.gradient_checkpointing and self.training:
            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
                                                  mask_pad)
        else:
            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks

    def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
                       pos_emb: torch.Tensor,
                       mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        return xs

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, xs: torch.Tensor,
                                    chunk_masks: torch.Tensor,
                                    pos_emb: torch.Tensor,
                                    mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
                                                    chunk_masks, pos_emb,
                                                    mask_pad)
        return xs

    def forward_chunk(
        self,
        xs: torch.Tensor,
        offset: int,
        required_cache_size: int,
        att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """ Forward just one chunk

        Args:
            xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
                where `time == (chunk_size - 1) * subsample_rate + \
                        subsample.right_context + 1`
            offset (int): current offset in encoder output time stamp
            required_cache_size (int): cache size required for next chunk
                compuation
                >=0: actual cache size
                <0: means all history cache is required
            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
                transformer/conformer attention, with shape
                (elayers, head, cache_t1, d_k * 2), where
                `head * d_k == hidden-dim` and
                `cache_t1 == chunk_size * num_decoding_left_chunks`.
            cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
                (elayers, b=1, hidden-dim, cache_t2), where
                `cache_t2 == cnn.lorder - 1`

        Returns:
            torch.Tensor: output of current input xs,
                with shape (b=1, chunk_size, hidden-dim).
            torch.Tensor: new attention cache required for next chunk, with
                dynamic shape (elayers, head, ?, d_k * 2)
                depending on required_cache_size.
            torch.Tensor: new conformer cnn cache required for next chunk, with
                same shape as the original cnn_cache.
        """
        assert xs.size(0) == 1
        # tmp_masks is just for interface compatibility
        tmp_masks = torch.ones(1,
                               xs.size(1),
                               device=xs.device,
                               dtype=torch.bool)
        tmp_masks = tmp_masks.unsqueeze(1)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
        # NOTE(xcsong): After  embed, shape(xs) is (b=1, chunk_size, hidden-dim)
        elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
        chunk_size = xs.size(1)
        attention_key_size = cache_t1 + chunk_size
        pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
                                               size=attention_key_size)
        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
            next_cache_start = attention_key_size
        else:
            next_cache_start = max(attention_key_size - required_cache_size, 0)
        r_att_cache = []
        r_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            # NOTE(xcsong): Before layer.forward
            #   shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
            #   shape(cnn_cache[i])       is (b=1, hidden-dim, cache_t2)
            xs, _, new_att_cache, new_cnn_cache = layer(
                xs,
                att_mask,
                pos_emb,
                att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
                cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
            # NOTE(xcsong): After layer.forward
            #   shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
            #   shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
            r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
            r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
        if self.normalize_before:
            xs = self.after_norm(xs)

        # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
        #   ? may be larger than cache_t1, it depends on required_cache_size
        r_att_cache = torch.cat(r_att_cache, dim=0)
        # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
        r_cnn_cache = torch.cat(r_cnn_cache, dim=0)

        return (xs, r_att_cache, r_cnn_cache)

    def forward_chunk_by_chunk(
        self,
        xs: torch.Tensor,
        decoding_chunk_size: int,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Forward input chunk by chunk with chunk_size like a streaming
            fashion

        Here we should pay special attention to computation cache in the
        streaming style forward chunk by chunk. Three things should be taken
        into account for computation in the current network:
            1. transformer/conformer encoder layers output cache
            2. convolution in conformer
            3. convolution in subsampling

        However, we don't implement subsampling cache for:
            1. We can control subsampling module to output the right result by
               overlapping input instead of cache left context, even though it
               wastes some computation, but subsampling only takes a very
               small fraction of computation in the whole model.
            2. Typically, there are several covolution layers with subsampling
               in subsampling module, it is tricky and complicated to do cache
               with different convolution layers with different subsampling
               rate.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling, we need to rewrite it to make it work
               with cache, which is not prefered.
        Args:
            xs (torch.Tensor): (1, max_len, dim)
            chunk_size (int): decoding chunk size
        """
        assert decoding_chunk_size > 0
        # The model is trained by static or dynamic chunk
        assert self.static_chunk_size > 0 or self.use_dynamic_chunk
        subsampling = self.embed.subsampling_rate
        context = self.embed.right_context + 1  # Add current frame
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context
        num_frames = xs.size(1)
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        outputs = []
        offset = 0
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks

        # Feed forward overlap input step by step
        for cur in range(0, num_frames - context + 1, stride):
            end = min(cur + decoding_window, num_frames)
            chunk_xs = xs[:, cur:end, :]
            (y, att_cache,
             cnn_cache) = self.forward_chunk(chunk_xs, offset,
                                             required_cache_size, att_cache,
                                             cnn_cache)
            outputs.append(y)
            offset += y.size(1)
        ys = torch.cat(outputs, 1)
        masks = torch.ones((1, 1, ys.size(1)),
                           device=ys.device,
                           dtype=torch.bool)
        return ys, masks


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        key_bias: bool = True,
        selfattention_layer_type: str = "selfattn",
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
    ):
        """ Construct TransformerEncoder

        See Encoder for the meaning of each parameter.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
        self.encoders = torch.nn.ModuleList([
            TransformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
                    attention_heads, output_size, attention_dropout_rate,
                    key_bias),
                PositionwiseFeedForward(output_size, linear_units,
                                        dropout_rate, activation),
                dropout_rate, normalize_before) for _ in range(num_blocks)
        ])


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "rel_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        positionwise_conv_kernel_size: int = 1,
        macaron_style: bool = True,
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        cnn_module_kernel: int = 15,
        causal: bool = False,
        cnn_module_norm: str = "batch_norm",
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
    ):
        """Construct ConformerEncoder

        Args:
            input_size to use_dynamic_chunk, see in BaseEncoder
            positionwise_conv_kernel_size (int): Kernel size of positionwise
                conv1d layer.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now, it's just for configure
                compatibility.
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            key_bias: whether use bias in attention.linear_k, False for whisper models.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        # self-attention module definition
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
            key_bias,
        )
        # feed-forward module definition
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
        # convolution module definition
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        self.encoders = torch.nn.ModuleList([
            ConformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
                    *encoder_selfattn_layer_args),
                PositionwiseFeedForward(*positionwise_layer_args),
                PositionwiseFeedForward(
                    *positionwise_layer_args) if macaron_style else None,
                ConvolutionModule(
                    *convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
            ) for _ in range(num_blocks)
        ])


class BlockConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "rel_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        positionwise_conv_kernel_size: int = 1,
        macaron_style: bool = True,
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        cnn_module_kernel: int = 15,
        causal: bool = False,
        cnn_module_norm: str = "batch_norm",
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        block_size=25,
    ):
        """Construct ConformerEncoder

        Args:
            input_size to use_dynamic_chunk, see in BaseEncoder
            positionwise_conv_kernel_size (int): Kernel size of positionwise
                conv1d layer.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now, it's just for configure
                compatibility.
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            key_bias: whether use bias in attention.linear_k, False for whisper models.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        # self-attention module definition
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
            key_bias,
            block_size,
        )
        # feed-forward module definition
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
        # convolution module definition
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        self.encoders = torch.nn.ModuleList([
            ConformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
                    *encoder_selfattn_layer_args),
                PositionwiseFeedForward(*positionwise_layer_args),
                PositionwiseFeedForward(
                    *positionwise_layer_args) if macaron_style else None,
                ConvolutionModule(
                    *convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
            ) for _ in range(num_blocks)
        ])
        self.block_size = block_size
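The forward_chunk_by_chunk docstring above describes streaming inference that carries attention (and, for conformers, CNN) caches across chunks. Below is a small hedged sketch of that flow, not part of this commit. It assumes the rest of the cosyvoice package referenced by the imports (attention, subsampling, and mask utilities) is importable and behaves like its WeNet counterparts; the feature dimension, chunk size, and input length are arbitrary illustration values.

# Illustrative streaming sketch: whole-utterance forward vs. chunked forward.
import torch
from cosyvoice.transformer.encoder import TransformerEncoder

encoder = TransformerEncoder(input_size=80,
                             output_size=256,
                             static_chunk_size=16)   # chunk-trained configuration
encoder.eval()

xs = torch.randn(1, 203, 80)      # (batch=1, frames, mel-dim)
xs_lens = torch.tensor([203])

with torch.no_grad():
    full, masks = encoder(xs, xs_lens)
    streamed, _ = encoder.forward_chunk_by_chunk(xs,
                                                 decoding_chunk_size=16,
                                                 num_decoding_left_chunks=-1)
# Both outputs are (1, T', 256) after conv2d subsampling; the streamed length
# can differ slightly at the final partial chunk.
print(full.shape, streamed.shape)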
third_party/GLM-4-Voice/cosyvoice/transformer/encoder_layer.py (new file, mode 100644)
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder self-attention layer definition."""
from
typing
import
Optional
,
Tuple
import
torch
from
torch
import
nn
class
TransformerEncoderLayer
(
nn
.
Module
):
"""Encoder layer module.
Args:
size (int): Input dimension.
self_attn (torch.nn.Module): Self-attention module instance.
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (torch.nn.Module): Feed-forward module instance.
`PositionwiseFeedForward`, instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool):
True: use layer_norm before each sub-block.
False: to use layer_norm after each sub-block.
"""
def
__init__
(
self
,
size
:
int
,
self_attn
:
torch
.
nn
.
Module
,
feed_forward
:
torch
.
nn
.
Module
,
dropout_rate
:
float
,
normalize_before
:
bool
=
True
,
):
"""Construct an EncoderLayer object."""
super
().
__init__
()
self
.
self_attn
=
self_attn
self
.
feed_forward
=
feed_forward
self
.
norm1
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
self
.
norm2
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
self
.
dropout
=
nn
.
Dropout
(
dropout_rate
)
self
.
size
=
size
self
.
normalize_before
=
normalize_before
def
forward
(
self
,
x
:
torch
.
Tensor
,
mask
:
torch
.
Tensor
,
pos_emb
:
torch
.
Tensor
,
mask_pad
:
torch
.
Tensor
=
torch
.
ones
((
0
,
0
,
0
),
dtype
=
torch
.
bool
),
att_cache
:
torch
.
Tensor
=
torch
.
zeros
((
0
,
0
,
0
,
0
)),
cnn_cache
:
torch
.
Tensor
=
torch
.
zeros
((
0
,
0
,
0
,
0
)),
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
"""Compute encoded features.
Args:
x (torch.Tensor): (#batch, time, size)
mask (torch.Tensor): Mask tensor for the input (#batch, time,time),
(0, 0, 0) means fake mask.
pos_emb (torch.Tensor): just for interface compatibility
to ConformerEncoderLayer
mask_pad (torch.Tensor): does not used in transformer layer,
just for unified api with conformer.
att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
(#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
cnn_cache (torch.Tensor): Convolution cache in conformer layer
(#batch=1, size, cache_t2), not used here, it's for interface
compatibility to ConformerEncoderLayer.
Returns:
torch.Tensor: Output tensor (#batch, time, size).
torch.Tensor: Mask tensor (#batch, time, time).
torch.Tensor: att_cache tensor,
(#batch=1, head, cache_t1 + time, d_k * 2).
torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2).
"""
residual
=
x
if
self
.
normalize_before
:
x
=
self
.
norm1
(
x
)
x_att
,
new_att_cache
=
self
.
self_attn
(
x
,
x
,
x
,
mask
,
pos_emb
=
pos_emb
,
cache
=
att_cache
)
x
=
residual
+
self
.
dropout
(
x_att
)
if
not
self
.
normalize_before
:
x
=
self
.
norm1
(
x
)
residual
=
x
if
self
.
normalize_before
:
x
=
self
.
norm2
(
x
)
x
=
residual
+
self
.
dropout
(
self
.
feed_forward
(
x
))
if
not
self
.
normalize_before
:
x
=
self
.
norm2
(
x
)
fake_cnn_cache
=
torch
.
zeros
((
0
,
0
,
0
),
dtype
=
x
.
dtype
,
device
=
x
.
device
)
return
x
,
mask
,
new_att_cache
,
fake_cnn_cache
class
ConformerEncoderLayer
(
nn
.
Module
):
"""Encoder layer module.
Args:
size (int): Input dimension.
self_attn (torch.nn.Module): Self-attention module instance.
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (torch.nn.Module): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
feed_forward_macaron (torch.nn.Module): Additional feed-forward module
instance.
`PositionwiseFeedForward` instance can be used as the argument.
conv_module (torch.nn.Module): Convolution module instance.
`ConvlutionModule` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool):
True: use layer_norm before each sub-block.
False: use layer_norm after each sub-block.
"""
def
__init__
(
self
,
size
:
int
,
self_attn
:
torch
.
nn
.
Module
,
feed_forward
:
Optional
[
nn
.
Module
]
=
None
,
feed_forward_macaron
:
Optional
[
nn
.
Module
]
=
None
,
conv_module
:
Optional
[
nn
.
Module
]
=
None
,
dropout_rate
:
float
=
0.1
,
normalize_before
:
bool
=
True
,
):
"""Construct an EncoderLayer object."""
super
().
__init__
()
self
.
self_attn
=
self_attn
self
.
feed_forward
=
feed_forward
self
.
feed_forward_macaron
=
feed_forward_macaron
self
.
conv_module
=
conv_module
self
.
norm_ff
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
# for the FNN module
self
.
norm_mha
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
# for the MHA module
if
feed_forward_macaron
is
not
None
:
self
.
norm_ff_macaron
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
self
.
ff_scale
=
0.5
else
:
self
.
ff_scale
=
1.0
if
self
.
conv_module
is
not
None
:
self
.
norm_conv
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
# for the CNN module
self
.
norm_final
=
nn
.
LayerNorm
(
size
,
eps
=
1e-5
)
# for the final output of the block
self
.
dropout
=
nn
.
Dropout
(
dropout_rate
)
self
.
size
=
size
self
.
normalize_before
=
normalize_before
    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.
        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
                (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): positional encoding, must not be None
                for ConformerEncoderLayer.
            mask_pad (torch.Tensor): batch padding mask used for the conv module,
                (#batch, 1, time), (0, 0, 0) means fake mask.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2)
        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time, time).
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
        """
        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)
        x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache)
        x = residual + self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        # Fake new cnn cache here, and then change it in conv_module
        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
            x = residual + self.dropout(x)
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        return x, mask, new_att_cache, new_cnn_cache
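For reference, a minimal smoke test of the layer above. The attention and feed-forward stubs below are illustrative assumptions (in the real model `self_attn` is a `RelPositionMultiHeadedAttention` and `feed_forward` a `PositionwiseFeedForward` from this commit); they only mimic the interfaces the layer expects.

# Minimal sketch (not part of the commit): drive ConformerEncoderLayer with stub
# sub-modules to check tensor shapes.
import torch
from torch import nn

class StubSelfAttn(nn.Module):
    # Stand-in matching the (q, k, v, mask, pos_emb, cache) -> (out, new_cache) interface.
    def __init__(self, size):
        super().__init__()
        self.proj = nn.Linear(size, size)

    def forward(self, q, k, v, mask, pos_emb, cache):
        return self.proj(q), cache

size = 64
layer = ConformerEncoderLayer(
    size=size,
    self_attn=StubSelfAttn(size),
    feed_forward=nn.Sequential(nn.Linear(size, 256), nn.ReLU(), nn.Linear(256, size)),
    conv_module=None,          # no convolution branch in this sketch
    dropout_rate=0.1,
    normalize_before=True,
)
x = torch.randn(2, 10, size)
mask = torch.ones(2, 10, 10, dtype=torch.bool)
pos_emb = torch.zeros(2, 10, size)
out, out_mask, att_cache, cnn_cache = layer(x, mask, pos_emb)
assert out.shape == (2, 10, size)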
third_party/GLM-4-Voice/cosyvoice/transformer/label_smoothing_loss.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Label smoothing module."""
import torch
from torch import nn


class LabelSmoothingLoss(nn.Module):
    """Label-smoothing loss.
    In a standard CE loss, the label's data distribution is:
    [0,1,2] ->
    [
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
    In the smoothed version of the CE loss, some probability mass
    is taken from the true label prob (1.0) and is divided
    among the other labels.
    e.g.
    smoothing=0.1
    [0,1,2] ->
    [
        [0.9, 0.05, 0.05],
        [0.05, 0.9, 0.05],
        [0.05, 0.05, 0.9],
    ]
    Args:
        size (int): the number of classes
        padding_idx (int): padding class id which will be ignored for loss computation
        smoothing (float): smoothing rate (0.0 means the conventional CE)
        normalize_length (bool):
            normalize loss by sequence length if True
            normalize loss by batch size if False
    """
    def __init__(self,
                 size: int,
                 padding_idx: int,
                 smoothing: float,
                 normalize_length: bool = False):
        """Construct a LabelSmoothingLoss object."""
        super(LabelSmoothingLoss, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="none")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.normalize_length = normalize_length
    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Compute loss between x and target.
        The model output and data label tensors are flattened to
        (batch*seqlen, class) shape and a mask is applied to the
        padding part, which should not contribute to the loss.
        Args:
            x (torch.Tensor): prediction (batch, seqlen, class)
            target (torch.Tensor):
                target signal masked with self.padding_idx (batch, seqlen)
        Returns:
            loss (torch.Tensor): The KL loss, scalar float value
        """
        assert x.size(2) == self.size
        batch_size = x.size(0)
        x = x.view(-1, self.size)
        target = target.view(-1)
        # use zeros_like instead of torch.no_grad() for true_dist,
        # since no_grad() can not be exported by JIT
        true_dist = torch.zeros_like(x)
        true_dist.fill_(self.smoothing / (self.size - 1))
        ignore = target == self.padding_idx  # (B,)
        total = len(target) - ignore.sum().item()
        target = target.masked_fill(ignore, 0)  # avoid -1 index
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
        denom = total if self.normalize_length else batch_size
        return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
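A quick usage sketch of the loss above (the sizes and labels are illustrative, not from the commit): positions padded with -1 are excluded from the KL term and from the normalizer when normalize_length is True.

# Minimal sketch: scalar KL-based loss over a padded batch.
import torch

criterion = LabelSmoothingLoss(size=3, padding_idx=-1, smoothing=0.1,
                               normalize_length=True)
logits = torch.randn(2, 4, 3)             # (batch, seqlen, class)
targets = torch.tensor([[0, 1, 2, -1],
                        [2, 0, -1, -1]])  # -1 marks padding
loss = criterion(logits, targets)
print(loss)                               # scalar tensor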
third_party/GLM-4-Voice/cosyvoice/transformer/positionwise_feed_forward.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Positionwise feed forward layer definition."""
import torch


class PositionwiseFeedForward(torch.nn.Module):
    """Positionwise feed forward layer.
    The feed-forward transform is applied at each position of the sequence.
    The output dim is the same as the input dim.
    Args:
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation function
    """
    def __init__(
            self,
            idim: int,
            hidden_units: int,
            dropout_rate: float,
            activation: torch.nn.Module = torch.nn.ReLU(),
    ):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.activation = activation
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.w_2 = torch.nn.Linear(hidden_units, idim)

    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """Forward function.
        Args:
            xs: input tensor (B, L, D)
        Returns:
            output tensor, (B, L, D)
        """
        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
class MoEFFNLayer(torch.nn.Module):
    """
    Mixture-of-experts positionwise feed forward layer.
    See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
    The output dim is the same as the input dim.
    Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
                  https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
    Args:
        n_expert: number of experts.
        n_expert_per_token: The actual number of experts used for each frame
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation function
    """
    def __init__(
            self,
            n_expert: int,
            n_expert_per_token: int,
            idim: int,
            hidden_units: int,
            dropout_rate: float,
            activation: torch.nn.Module = torch.nn.ReLU(),
    ):
        super(MoEFFNLayer, self).__init__()
        self.gate = torch.nn.Linear(idim, n_expert, bias=False)
        self.experts = torch.nn.ModuleList(
            PositionwiseFeedForward(idim, hidden_units, dropout_rate, activation)
            for _ in range(n_expert))
        self.n_expert_per_token = n_expert_per_token
    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """Forward function.
        Args:
            xs: input tensor (B, L, D)
        Returns:
            output tensor, (B, L, D)
        """
        B, L, D = xs.size()  # batch size, sequence length, embedding dimension (idim)
        xs = xs.view(-1, D)  # (B*L, D)
        router = self.gate(xs)  # (B*L, n_expert)
        logits, indices = torch.topk(router, self.n_expert_per_token)
        # logits: (B*L, n_expert_per_token), indices: (B*L, n_expert_per_token)
        weights = torch.nn.functional.softmax(
            logits, dim=1, dtype=torch.float).to(dtype=xs.dtype)  # (B*L, n_expert_per_token)
        output = torch.zeros_like(xs)  # (B*L, D)
        for i, expert in enumerate(self.experts):
            mask = indices == i
            batch_idx, ith_expert = torch.where(mask)
            output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(xs[batch_idx])
        return output.view(B, L, D)
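A shape-level sketch of the MoE layer above (the sizes are illustrative assumptions): each frame is routed to its top n_expert_per_token experts and the expert outputs are mixed with softmax weights, so the output keeps the input shape.

# Minimal sketch: route a (B, L, D) batch through 4 experts, 2 per token.
import torch

moe = MoEFFNLayer(n_expert=4, n_expert_per_token=2,
                  idim=64, hidden_units=256, dropout_rate=0.1)
xs = torch.randn(3, 7, 64)
ys = moe(xs)
assert ys.shape == xs.shape  # output dim matches input dim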
third_party/GLM-4-Voice/cosyvoice/transformer/subsampling.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Subsampling layer definition."""
from typing import Tuple, Union

import torch
class BaseSubsampling(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.right_context = 0
        self.subsampling_rate = 1

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return self.pos_enc.position_encoding(offset, size)
class EmbedinigNoSubsampling(BaseSubsampling):
    """Embedding input without subsampling
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        super().__init__()
        self.embed = torch.nn.Embedding(idim, odim)
        self.pos_enc = pos_enc_class

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .
        """
        x = self.embed(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask
class LinearNoSubsampling(BaseSubsampling):
    """Linear transform the input without subsampling
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .
        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask
class Conv1dSubsampling2(BaseSubsampling):
    """Convolutional 1D subsampling (to 1/2 length).
       It is designed for Whisper, ref:
       https://github.com/openai/whisper/blob/main/whisper/model.py
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv1dSubsampling2 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
            torch.nn.GELU(),
            torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
            torch.nn.GELU(),
        )
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 2
        # 4 = (3 - 1) * 1 + (3 - 1) * 1
        self.right_context = 4

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 2.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 2.
            torch.Tensor: positional encoding
        """
        time = x.size(1)
        x = x.transpose(1, 2)  # (b, f, t)
        x = self.conv(x)
        x = x.transpose(1, 2)  # (b, t, f)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]
class Conv2dSubsampling4(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/4 length).
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling4 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.out = torch.nn.Sequential(
            torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 4
        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.
            torch.Tensor: positional encoding
        """
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
class Conv2dSubsampling6(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/6 length).
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling6 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 5, 3),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
                                      odim)
        self.pos_enc = pos_enc_class
        # 10 = (3 - 1) * 1 + (5 - 1) * 2
        self.subsampling_rate = 6
        self.right_context = 10

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 6.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 6.
            torch.Tensor: positional encoding
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
class Conv2dSubsampling8(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/8 length).
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a Conv2dSubsampling8 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(
            odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
        self.pos_enc = pos_enc_class
        self.subsampling_rate = 8
        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
        self.right_context = 14

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 8.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 8.
            torch.Tensor: positional encoding
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
class LegacyLinearNoSubsampling(BaseSubsampling):
    """Linear transform the input without subsampling
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct a linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).
        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .
        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask
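For reference, a shape check of the 1/4 subsampling defined above. The positional-encoding stub is an assumption standing in for the classes in cosyvoice.transformer.embedding; it only has to return (x, pos_emb) the way the real pos_enc_class instances do.

# Minimal sketch: 100 input frames become ((100 - 1) // 2 - 1) // 2 = 24 frames,
# and the mask slicing [:, :, 2::2][:, :, 2::2] keeps the same 24 steps.
import torch

class StubPosEnc(torch.nn.Module):
    def forward(self, x, offset=0):
        return x, torch.zeros(1, x.size(1), x.size(2))

sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1,
                         pos_enc_class=StubPosEnc())
x = torch.randn(2, 100, 80)                      # (batch, time, idim)
x_mask = torch.ones(2, 1, 100, dtype=torch.bool)
y, pos_emb, y_mask = sub(x, x_mask)
assert y.shape == (2, 24, 256)
assert y_mask.shape == (2, 1, 24)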
third_party/GLM-4-Voice/cosyvoice/utils/__init__.py
0 → 100644
View file @
39ac40a9
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc
0 → 100644
View file @
39ac40a9
File added
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc
0 → 100644
View file @
39ac40a9
File added
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc
0 → 100644
View file @
39ac40a9
File added
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/common.cpython-310.pyc
0 → 100644
View file @
39ac40a9
File added
third_party/GLM-4-Voice/cosyvoice/utils/__pycache__/mask.cpython-310.pyc
0 → 100644
View file @
39ac40a9
File added
third_party/GLM-4-Voice/cosyvoice/utils/block_mask_util.py
0 → 100644
View file @
39ac40a9
import torch


def create_grid_mask(seq_length, trunck_length, fill_triangle):
    assert seq_length > 0

    # First build a grid mask without considering seen_length:
    if fill_triangle:
        mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1)
        # lower triangle and main diagonal are all 1
    else:
        mask = torch.zeros(seq_length, seq_length)

    for i in range(seq_length):
        trunck_idx = i // trunck_length
        trunck_start = trunck_idx * trunck_length
        trunck_end = trunck_length + trunck_start
        mask[i][trunck_start:trunck_end] = 1

    return mask


if __name__ == "__main__":
    mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int()
    print(mask)
    # tensor([[1, 1, 1, 0, 0, 0, 0, 0],
    #         [1, 1, 1, 0, 0, 0, 0, 0],
    #         [1, 1, 1, 0, 0, 0, 0, 0],
    #         [1, 1, 1, 1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1, 1, 1, 1],
    #         [1, 1, 1, 1, 1, 1, 1, 1]])
third_party/GLM-4-Voice/cosyvoice/utils/class_utils.py
0 → 100644
View file @
39ac40a9
# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from cosyvoice.transformer.activation import Swish
from cosyvoice.transformer.subsampling import (
    LinearNoSubsampling,
    EmbedinigNoSubsampling,
    Conv1dSubsampling2,
    Conv2dSubsampling4,
    Conv2dSubsampling6,
    Conv2dSubsampling8,
)
from cosyvoice.transformer.embedding import (
    PositionalEncoding,
    RelPositionalEncoding,
    WhisperPositionalEncoding,
    LearnablePositionalEncoding,
    NoPositionalEncoding,
)
from cosyvoice.transformer.attention import (
    MultiHeadedAttention,
    RelPositionMultiHeadedAttention,
    BlockRelPositionMultiHeadedAttention,
)
from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling


COSYVOICE_ACTIVATION_CLASSES = {
    "hardtanh": torch.nn.Hardtanh,
    "tanh": torch.nn.Tanh,
    "relu": torch.nn.ReLU,
    "selu": torch.nn.SELU,
    "swish": getattr(torch.nn, "SiLU", Swish),
    "gelu": torch.nn.GELU,
}

COSYVOICE_SUBSAMPLE_CLASSES = {
    "linear": LinearNoSubsampling,
    "linear_legacy": LegacyLinearNoSubsampling,
    "embed": EmbedinigNoSubsampling,
    "conv1d2": Conv1dSubsampling2,
    "conv2d": Conv2dSubsampling4,
    "conv2d6": Conv2dSubsampling6,
    "conv2d8": Conv2dSubsampling8,
    'paraformer_dummy': torch.nn.Identity,
}

COSYVOICE_EMB_CLASSES = {
    "embed": PositionalEncoding,
    "abs_pos": PositionalEncoding,
    "rel_pos": RelPositionalEncoding,
    "rel_pos_espnet": EspnetRelPositionalEncoding,
    "no_pos": NoPositionalEncoding,
    "abs_pos_whisper": WhisperPositionalEncoding,
    "embed_learnable_pe": LearnablePositionalEncoding,
}

COSYVOICE_ATTENTION_CLASSES = {
    "selfattn": MultiHeadedAttention,
    "rel_selfattn": RelPositionMultiHeadedAttention,
    "block_rel_selfattn": BlockRelPositionMultiHeadedAttention,
}
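These registries are what the encoder and decoder configs resolve by name when they are built. A small hedged example of how a config key maps to a class (the keys shown are just illustrative lookups, not a config from this commit):

# Minimal sketch: resolve config strings to classes via the registries above.
activation = COSYVOICE_ACTIVATION_CLASSES["swish"]()          # SiLU on modern torch, else Swish
subsampling_cls = COSYVOICE_SUBSAMPLE_CLASSES["conv2d"]       # -> Conv2dSubsampling4
attention_cls = COSYVOICE_ATTENTION_CLASSES["rel_selfattn"]   # -> RelPositionMultiHeadedAttention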
third_party/GLM-4-Voice/cosyvoice/utils/common.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Unility functions for Transformer."""
from typing import List

import torch

IGNORE_ID = -1


def pad_list(xs: List[torch.Tensor], pad_value: int):
    """Perform padding for the list of tensors.
    Args:
        xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): Value for padding.
    Returns:
        Tensor: Padded tensor (B, Tmax, `*`).
    Examples:
        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
        >>> x
        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
        >>> pad_list(x, 0)
        tensor([[1., 1., 1., 1.],
                [1., 1., 0., 0.],
                [1., 0., 0., 0.]])
    """
    max_len = max([len(item) for item in xs])
    batchs = len(xs)
    ndim = xs[0].ndim
    if ndim == 1:
        pad_res = torch.zeros(batchs, max_len,
                              dtype=xs[0].dtype, device=xs[0].device)
    elif ndim == 2:
        pad_res = torch.zeros(batchs, max_len, xs[0].shape[1],
                              dtype=xs[0].dtype, device=xs[0].device)
    elif ndim == 3:
        pad_res = torch.zeros(batchs, max_len, xs[0].shape[1], xs[0].shape[2],
                              dtype=xs[0].dtype, device=xs[0].device)
    else:
        raise ValueError(f"Unsupported ndim: {ndim}")
    pad_res.fill_(pad_value)
    for i in range(batchs):
        pad_res[i, :len(xs[i])] = xs[i]
    return pad_res


def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
                ignore_label: int) -> torch.Tensor:
    """Calculate accuracy.
    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax).
        ignore_label (int): Ignore label id.
    Returns:
        torch.Tensor: Accuracy value (0.0 - 1.0).
    """
    pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1),
                                pad_outputs.size(1)).argmax(2)
    mask = pad_targets != ignore_label
    numerator = torch.sum(
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    denominator = torch.sum(mask)
    return (numerator / denominator).detach()


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)
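A short sketch of how the padding and accuracy helpers fit together (shapes are illustrative): targets padded with IGNORE_ID are excluded from the accuracy, and the predictions are passed in flattened as (B * Lmax, D).

# Minimal sketch: pad variable-length targets, then score flattened logits.
import torch

targets = pad_list([torch.tensor([1, 2, 0]), torch.tensor([3])], IGNORE_ID)  # (2, 3)
logits = torch.randn(2, 3, 5)                    # (B, Lmax, num_classes)
acc = th_accuracy(logits.view(-1, 5), targets, ignore_label=IGNORE_ID)
print(acc)                                       # tensor between 0.0 and 1.0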
third_party/GLM-4-Voice/cosyvoice/utils/executor.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from contextlib import nullcontext
import os

import torch
import torch.distributed as dist
import tqdm

from cosyvoice.utils.train_utils import (update_parameter_and_lr, log_per_step,
                                          log_per_save, batch_forward,
                                          batch_backward, save_model,
                                          cosyvoice_join)


class Executor:

    def __init__(self):
        self.step = 0
        self.epoch = 0
        self.rank = int(os.environ.get('RANK', 0))
        self.device = torch.device('cuda:{}'.format(self.rank))

    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader,
                       cv_data_loader, writer, info_dict, group_join):
        ''' Train one epoch
        '''

        lr = optimizer.param_groups[0]['lr']
        logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
        logging.info('using accumulate grad, new batch size is {} times'
                     ' larger than before'.format(info_dict['accum_grad']))
        # A context manager to be used in conjunction with an instance of
        # torch.nn.parallel.DistributedDataParallel to be able to train
        # with uneven inputs across participating processes.
        model.train()
        model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
        with model_context():
            for batch_idx, batch_dict in tqdm.tqdm(enumerate(train_data_loader)):
                # print("======== forword ========")
                info_dict["tag"] = "TRAIN"
                info_dict["step"] = self.step
                info_dict["epoch"] = self.epoch
                info_dict["batch_idx"] = batch_idx
                if cosyvoice_join(group_join, info_dict):
                    break

                # import pdb
                # pdb.set_trace()

                # Disable gradient synchronizations across DDP processes.
                # Within this context, gradients will be accumulated on module
                # variables, which will later be synchronized.
                if info_dict['train_engine'] == 'torch_ddp' and \
                        (batch_idx + 1) % info_dict["accum_grad"] != 0:
                    context = model.no_sync
                # Used for single gpu training and DDP gradient synchronization
                # processes.
                else:
                    context = nullcontext

                new_batch_dict = {
                    # "utts": batch_dict["utts"],
                    "speech_token": batch_dict["speech_token"],
                    "speech_token_len": batch_dict["speech_token_len"],
                    "speech_feat": batch_dict["speech_feat"],
                    "speech_feat_len": batch_dict["speech_feat_len"],
                    "embedding": batch_dict["embedding"],
                    # "embedding": torch.zeros((batch_dict["speech_feat"].size(0), 192),
                    #                          device=batch_dict["speech_feat"].device),
                }

                with context():
                    info_dict = batch_forward(model, new_batch_dict, info_dict)
                    info_dict = batch_backward(model, info_dict)

                info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
                log_per_step(writer, info_dict)
                # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
                if info_dict['save_per_step'] > 0 and \
                        (self.step + 1) % info_dict['save_per_step'] == 0 and \
                        (batch_idx + 1) % info_dict["accum_grad"] == 0:
                    dist.barrier()
                    # try:
                    #     dist.barrier()
                    # except RuntimeError as e:
                    #     logging.info('except RuntimeError as e: {}'.format(e))
                    self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
                    model.train()
                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
                    self.step += 1
        dist.barrier()
        # try:
        #     dist.barrier()
        # except RuntimeError as e:
        #     logging.info('except RuntimeError as e: {}'.format(e))
        self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)

    @torch.inference_mode()
    def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
        ''' Cross validation on
        '''
        logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(
            self.epoch, self.step + 1, on_batch_end, self.rank))
        model.eval()
        total_num_utts, total_loss_dict = 0, {}  # avoid division by 0
        for batch_idx, batch_dict in enumerate(cv_data_loader):
            info_dict["tag"] = "CV"
            info_dict["step"] = self.step
            info_dict["epoch"] = self.epoch
            info_dict["batch_idx"] = batch_idx

            # num_utts = len(batch_dict["utts"])
            num_utts = batch_dict["speech_token"].size(0)
            total_num_utts += num_utts

            info_dict = batch_forward(model, batch_dict, info_dict)

            for k, v in info_dict['loss_dict'].items():
                if k not in total_loss_dict:
                    total_loss_dict[k] = []
                total_loss_dict[k].append(v.item() * num_utts)
            log_per_step(None, info_dict)
        for k, v in total_loss_dict.items():
            total_loss_dict[k] = sum(v) / total_num_utts
        info_dict['loss_dict'] = total_loss_dict
        log_per_save(writer, info_dict)
        model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end \
            else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1)
        save_model(model, model_name, info_dict)
third_party/GLM-4-Voice/cosyvoice/utils/file_utils.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import torchaudio


def read_lists(list_file):
    lists = []
    with open(list_file, 'r', encoding='utf8') as fin:
        for line in fin:
            lists.append(line.strip())
    return lists


def read_json_lists(list_file):
    lists = read_lists(list_file)
    results = {}
    for fn in lists:
        with open(fn, 'r', encoding='utf8') as fin:
            results.update(json.load(fin))
    return results


def load_wav(wav, target_sr):
    speech, sample_rate = torchaudio.load(wav)
    speech = speech.mean(dim=0, keepdim=True)
    if sample_rate != target_sr:
        assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(
            sample_rate, target_sr)
        speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
    return speech


def speed_change(waveform, sample_rate, speed_factor: str):
    effects = [
        ["tempo", speed_factor],  # speed_factor
        ["rate", f"{sample_rate}"]
    ]
    augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
        waveform, sample_rate, effects)
    return augmented_waveform, new_sample_rate
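A hedged usage sketch of the two audio helpers above; 'example.wav' is a placeholder path, not a file shipped with this repo, and the tempo value is illustrative.

# Minimal sketch: load a mono waveform at 16 kHz, then slow it down by 10%.
speech = load_wav('example.wav', target_sr=16000)            # placeholder path
slowed, sr = speed_change(speech, 16000, speed_factor='0.9')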
third_party/GLM-4-Voice/cosyvoice/utils/frontend_utils.py
0 → 100644
View file @
39ac40a9
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


# whether the text contains Chinese characters
def contains_chinese(text):
    return bool(chinese_char_pattern.search(text))


# replace special symbols
def replace_corner_mark(text):
    text = text.replace('²', '平方')
    text = text.replace('³', '立方')
    return text


# remove meaningless symbols
def remove_bracket(text):
    text = text.replace('(', '').replace(')', '')
    text = text.replace('【', '').replace('】', '')
    text = text.replace('`', '').replace('`', '')
    text = text.replace("——", " ")
    return text


# spell out Arabic numerals
def spell_out_number(text: str, inflect_parser):
    new_text = []
    st = None
    for i, c in enumerate(text):
        if not c.isdigit():
            if st is not None:
                num_str = inflect_parser.number_to_words(text[st:i])
                new_text.append(num_str)
                st = None
            new_text.append(c)
        else:
            if st is None:
                st = i
    if st is not None and st < len(text):
        num_str = inflect_parser.number_to_words(text[st:])
        new_text.append(num_str)
    return ''.join(new_text)


# split paragraph logic:
# 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len is less than merge_len
# 2. calculate sentence len according to lang
# 3. split sentences according to punctuation
def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60,
                    merge_len=20, comma_split=False):
    def calc_utt_length(_text: str):
        if lang == "zh":
            return len(_text)
        else:
            return len(tokenize(_text))

    def should_merge(_text: str):
        if lang == "zh":
            return len(_text) < merge_len
        else:
            return len(tokenize(_text)) < merge_len

    if lang == "zh":
        pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';']
    else:
        pounc = ['.', '?', '!', ';', ':']
    if comma_split:
        pounc.extend([',', ','])
    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st:i]) > 0:
                utts.append(text[st:i] + c)
            if i + 1 < len(text) and text[i + 1] in ['"', '”']:
                tmp = utts.pop(-1)
                utts.append(tmp + text[i + 1])
                st = i + 2
            else:
                st = i + 1
    if len(utts) == 0:
        if lang == "zh":
            utts.append(text + '。')
        else:
            utts.append(text + '.')
    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts


# remove blanks between Chinese characters
def replace_blank(text: str):
    out_str = []
    for i, c in enumerate(text):
        if c == " ":
            if ((text[i + 1].isascii() and text[i + 1] != " ") and
                    (text[i - 1].isascii() and text[i - 1] != " ")):
                out_str.append(c)
        else:
            out_str.append(c)
    return "".join(out_str)
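A short sketch of the splitting logic above applied to English text; the whitespace tokenizer and the length limits are stand-ins for the frontend's real tokenizer and defaults.

# Minimal sketch: split an English paragraph into utterances of at most ~8 tokens.
text = "This is the first sentence. Here comes a second one! Short tail."
pieces = split_paragraph(text,
                         tokenize=lambda s: s.split(),  # stand-in tokenizer
                         lang="en",
                         token_max_n=8, token_min_n=4, merge_len=2,
                         comma_split=False)
print(pieces)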