ModelZoo / sambert-hifigan_pytorch / Commits

Commit ee10550a ("Initial commit"), authored Feb 06, 2024 by liugh5.
Pipeline #790: canceled with stages. Changes: 197. Pipelines: 1.
Showing 20 changed files with 4257 additions and 0 deletions (+4257, -0):
kantts/models/sambert/kantts_sambert_divide.py (+884, -0)
kantts/models/sambert/positions.py (+98, -0)
kantts/models/utils.py (+23, -0)
kantts/preprocess/__init__.py (+0, -0)
kantts/preprocess/__pycache__/__init__.cpython-38.pyc (+0, -0)
kantts/preprocess/__pycache__/fp_processor.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/__init__.py (+0, -0)
kantts/preprocess/audio_processor/__pycache__/__init__.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/__pycache__/audio_processor.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/audio_processor.py (+791, -0)
kantts/preprocess/audio_processor/core/__init__.py (+0, -0)
kantts/preprocess/audio_processor/core/__pycache__/__init__.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/core/__pycache__/dsp.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/core/__pycache__/utils.cpython-38.pyc (+0, -0)
kantts/preprocess/audio_processor/core/dsp.py (+237, -0)
kantts/preprocess/audio_processor/core/utils.py (+555, -0)
kantts/preprocess/data_process.py (+246, -0)
kantts/preprocess/fp_processor.py (+158, -0)
kantts/preprocess/languages/PinYin/En2ChPhoneMap.txt (+2, -0)
kantts/preprocess/languages/PinYin/PhoneSet.xml (+1263, -0)
kantts/models/sambert/kantts_sambert_divide.py (new file, mode 100644)
import torch
import torch.nn as nn
import torch.nn.functional as F

from kantts.models.sambert import FFTBlock, PNCABlock, Prenet
from kantts.models.sambert.positions import (
    SinusoidalPositionEncoder,
    DurSinusoidalPositionEncoder,
)
from kantts.models.sambert.adaptors import (
    LengthRegulator,
    VarFsmnRnnNARPredictor,
    VarRnnARPredictor,
)
from kantts.models.sambert.fsmn import FsmnEncoderV2
from kantts.models.sambert.alignment import b_mas
from kantts.models.sambert.attention import ConvAttention
from kantts.models.utils import get_mask_from_lengths


class SelfAttentionEncoder(nn.Module):
    def __init__(
        self,
        n_layer,
        d_in,
        d_model,
        n_head,
        d_head,
        d_inner,
        dropout,
        dropout_att,
        dropout_relu,
        position_encoder,
    ):
        super(SelfAttentionEncoder, self).__init__()
        self.d_in = d_in
        self.d_model = d_model
        self.dropout = dropout

        d_in_lst = [d_in] + [d_model] * (n_layer - 1)
        self.fft = nn.ModuleList(
            [
                FFTBlock(
                    d,
                    d_model,
                    n_head,
                    d_head,
                    d_inner,
                    (3, 1),
                    dropout,
                    dropout_att,
                    dropout_relu,
                )
                for d in d_in_lst
            ]
        )
        self.ln = nn.LayerNorm(d_model, eps=1e-6)
        self.position_enc = position_encoder

    def forward(self, input, mask=None, return_attns=False):
        input *= self.d_model ** 0.5
        if isinstance(self.position_enc, SinusoidalPositionEncoder):
            input = self.position_enc(input)
        else:
            raise NotImplementedError
        input = F.dropout(input, p=self.dropout, training=self.training)

        enc_slf_attn_list = []
        max_len = input.size(1)
        if mask is not None:
            slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
        else:
            slf_attn_mask = None

        enc_output = input
        for id, layer in enumerate(self.fft):
            enc_output, enc_slf_attn = layer(
                enc_output, mask=mask, slf_attn_mask=slf_attn_mask
            )
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        enc_output = self.ln(enc_output)
        return enc_output, enc_slf_attn_list


class HybridAttentionDecoder(nn.Module):
    def __init__(
        self,
        d_in,
        prenet_units,
        n_layer,
        d_model,
        d_mem,
        n_head,
        d_head,
        d_inner,
        dropout,
        dropout_att,
        dropout_relu,
        d_out,
    ):
        super(HybridAttentionDecoder, self).__init__()
        self.d_model = d_model
        self.dropout = dropout

        self.prenet = Prenet(d_in, prenet_units, d_model)
        self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
        self.pnca = nn.ModuleList(
            [
                PNCABlock(
                    d_model,
                    d_mem,
                    n_head,
                    d_head,
                    d_inner,
                    (1, 1),
                    dropout,
                    dropout_att,
                    dropout_relu,
                )
                for _ in range(n_layer)
            ]
        )
        self.ln = nn.LayerNorm(d_model, eps=1e-6)
        self.dec_out_proj = nn.Linear(d_model, d_out)

    def reset_state(self):
        for layer in self.pnca:
            layer.reset_state()

    def get_pnca_attn_mask(
        self, device, max_len, x_band_width, h_band_width, masks=None
    ):
        if masks is not None:
            pnca_attn_mask = masks.unsqueeze(1).expand(-1, max_len, -1)
        else:
            pnca_attn_mask = None

        range_ = torch.arange(max_len).to(device)
        x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
        x_end = (range_ + 1)[None, None, :]
        h_start = range_[None, None, :]
        h_end = torch.clamp_max(range_ + h_band_width + 1, max_len + 1)[None, None, :]
        pnca_x_attn_mask = ~(
            (x_start <= range_[None, :, None]) & (x_end > range_[None, :, None])
        ).transpose(1, 2)
        pnca_h_attn_mask = ~(
            (h_start <= range_[None, :, None]) & (h_end > range_[None, :, None])
        ).transpose(1, 2)
        if pnca_attn_mask is not None:
            pnca_x_attn_mask = pnca_x_attn_mask | pnca_attn_mask
            pnca_h_attn_mask = pnca_h_attn_mask | pnca_attn_mask
            pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
                pnca_attn_mask.transpose(1, 2), False
            )
            pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
                pnca_attn_mask.transpose(1, 2), False
            )

        return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask

    # must call reset_state before
    def forward(
        self, input, memory, x_band_width, h_band_width, masks=None, return_attns=False
    ):
        input = self.prenet(input)
        input = torch.cat([memory, input], dim=-1)
        input = self.dec_in_proj(input)
        if masks is not None:
            input = input.masked_fill(masks.unsqueeze(-1), 0)
        input *= self.d_model ** 0.5
        input = F.dropout(input, p=self.dropout, training=self.training)

        max_len = input.size(1)
        pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
            input.device, max_len, x_band_width, h_band_width, masks
        )

        dec_pnca_attn_x_list = []
        dec_pnca_attn_h_list = []

        dec_output = input
        for id, layer in enumerate(self.pnca):
            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
                dec_output,
                memory,
                masks=masks,
                pnca_x_attn_mask=pnca_x_attn_mask,
                pnca_h_attn_mask=pnca_h_attn_mask,
            )
            if return_attns:
                dec_pnca_attn_x_list += [dec_pnca_attn_x]
                dec_pnca_attn_h_list += [dec_pnca_attn_h]

        dec_output = self.ln(dec_output)
        dec_output = self.dec_out_proj(dec_output)

        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list

    # must call reset_state before when step == 0
    def infer(
        self,
        step,
        input,
        memory,
        x_band_width,
        h_band_width,
        masks=None,
        return_attns=False,
    ):
        max_len = memory.size(1)

        input = self.prenet(input)
        input = torch.cat([memory[:, step : step + 1, :], input], dim=-1)
        input = self.dec_in_proj(input)
        input *= self.d_model ** 0.5
        input = F.dropout(input, p=self.dropout, training=self.training)

        pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
            input.device, max_len, x_band_width, h_band_width, masks
        )

        dec_pnca_attn_x_list = []
        dec_pnca_attn_h_list = []

        dec_output = input
        for id, layer in enumerate(self.pnca):
            if masks is not None:
                mask_step = masks[:, step : step + 1]
            else:
                mask_step = None
            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
                dec_output,
                memory,
                mask=mask_step,
                pnca_x_attn_mask=pnca_x_attn_mask[:, step : step + 1, : (step + 1)],
                pnca_h_attn_mask=pnca_h_attn_mask[:, step : step + 1, :],
            )
            if return_attns:
                dec_pnca_attn_x_list += [dec_pnca_attn_x]
                dec_pnca_attn_h_list += [dec_pnca_attn_h]

        dec_output = self.ln(dec_output)
        dec_output = self.dec_out_proj(dec_output)

        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list


class TextFftEncoder(nn.Module):
    def __init__(self, config):
        super(TextFftEncoder, self).__init__()

        d_emb = config["embedding_dim"]
        self.using_byte = False
        if config.get("using_byte", False):
            self.using_byte = True
            nb_ling_byte_index = config["byte_index"]
            self.byte_index_emb = nn.Embedding(nb_ling_byte_index, d_emb)
        else:
            # linguistic unit lookup table
            nb_ling_sy = config["sy"]
            nb_ling_tone = config["tone"]
            nb_ling_syllable_flag = config["syllable_flag"]
            nb_ling_ws = config["word_segment"]
            self.sy_emb = nn.Embedding(nb_ling_sy, d_emb)
            self.tone_emb = nn.Embedding(nb_ling_tone, d_emb)
            self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb)
            self.ws_emb = nn.Embedding(nb_ling_ws, d_emb)

        max_len = config["max_len"]
        nb_layers = config["encoder_num_layers"]
        nb_heads = config["encoder_num_heads"]
        d_model = config["encoder_num_units"]
        d_head = d_model // nb_heads
        d_inner = config["encoder_ffn_inner_dim"]
        dropout = config["encoder_dropout"]
        dropout_attn = config["encoder_attention_dropout"]
        dropout_relu = config["encoder_relu_dropout"]
        d_proj = config["encoder_projection_units"]

        self.d_model = d_model

        position_enc = SinusoidalPositionEncoder(max_len, d_emb)

        self.ling_enc = SelfAttentionEncoder(
            nb_layers,
            d_emb,
            d_model,
            nb_heads,
            d_head,
            d_inner,
            dropout,
            dropout_attn,
            dropout_relu,
            position_enc,
        )
        self.ling_proj = nn.Linear(d_model, d_proj, bias=False)

    def forward(self, inputs_ling, masks=None, return_attns=False):
        # Parse inputs_ling_seq
        if self.using_byte:
            inputs_byte_index = inputs_ling[:, :, 0]
            byte_index_embedding = self.byte_index_emb(inputs_byte_index)
            ling_embedding = byte_index_embedding
        else:
            inputs_sy = inputs_ling[:, :, 0]
            inputs_tone = inputs_ling[:, :, 1]
            inputs_syllable_flag = inputs_ling[:, :, 2]
            inputs_ws = inputs_ling[:, :, 3]

            # Lookup table
            sy_embedding = self.sy_emb(inputs_sy)
            tone_embedding = self.tone_emb(inputs_tone)
            syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag)
            ws_embedding = self.ws_emb(inputs_ws)

            ling_embedding = (
                sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding
            )

        enc_output, enc_slf_attn_lst = self.ling_enc(
            ling_embedding, masks, return_attns
        )

        if hasattr(self, "ling_proj"):
            enc_output = self.ling_proj(enc_output)

        return enc_output, enc_slf_attn_lst, ling_embedding


class TextEncoder(nn.Module):
    def __init__(self, config):
        super(TextEncoder, self).__init__()
        self.text_encoder = TextFftEncoder(config)
        self.se_enable = config.get("SE", False)
        if not self.se_enable:
            self.spk_tokenizer = nn.Embedding(
                config["speaker"], config["speaker_units"]
            )
        self.emo_tokenizer = nn.Embedding(config["emotion"], config["emotion_units"])
        # self.variance_adaptor = VarianceAdaptor(config)
        # self.mel_decoder = MelPNCADecoder(config)
        # self.mel_postnet = PostNet(config)
        self.MAS = False
        if config.get("MAS", False):
            self.MAS = True
            self.align_attention = ConvAttention(
                n_mel_channels=config["num_mels"],
                n_text_channels=config["embedding_dim"],
                n_att_channels=config["num_mels"],
            )
        self.fp_enable = config.get("FP", False)
        if self.fp_enable:
            self.FP_predictor = FP_Predictor(config)

    def forward(
        self,
        inputs_ling,
        inputs_emotion,
        inputs_speaker,
        inputs_ling_masks=None,
        return_attns=False,
    ):
        text_hid, enc_sla_attn_lst, ling_embedding = self.text_encoder(
            inputs_ling, inputs_ling_masks, return_attns
        )
        emo_hid = self.emo_tokenizer(inputs_emotion)
        spk_hid = (
            inputs_speaker if self.se_enable else self.spk_tokenizer(inputs_speaker)
        )
        if return_attns:
            return text_hid, enc_sla_attn_lst, ling_embedding, emo_hid, spk_hid
        else:
            return text_hid, ling_embedding, emo_hid, spk_hid


class VarianceAdaptor(nn.Module):
    def __init__(self, config):
        super(VarianceAdaptor, self).__init__()

        input_dim = (
            config["encoder_projection_units"]
            + config["emotion_units"]
            + config["speaker_units"]
        )
        filter_size = config["predictor_filter_size"]
        fsmn_num_layers = config["predictor_fsmn_num_layers"]
        num_memory_units = config["predictor_num_memory_units"]
        ffn_inner_dim = config["predictor_ffn_inner_dim"]
        dropout = config["predictor_dropout"]
        shift = config["predictor_shift"]
        lstm_units = config["predictor_lstm_units"]
        dur_pred_prenet_units = config["dur_pred_prenet_units"]
        dur_pred_lstm_units = config["dur_pred_lstm_units"]

        self.pitch_predictor = VarFsmnRnnNARPredictor(
            input_dim,
            filter_size,
            fsmn_num_layers,
            num_memory_units,
            ffn_inner_dim,
            dropout,
            shift,
            lstm_units,
        )
        self.energy_predictor = VarFsmnRnnNARPredictor(
            input_dim,
            filter_size,
            fsmn_num_layers,
            num_memory_units,
            ffn_inner_dim,
            dropout,
            shift,
            lstm_units,
        )
        self.duration_predictor = VarRnnARPredictor(
            input_dim, dur_pred_prenet_units, dur_pred_lstm_units
        )
        self.length_regulator = LengthRegulator(config["outputs_per_step"])
        self.dur_position_encoder = DurSinusoidalPositionEncoder(
            config["encoder_projection_units"], config["outputs_per_step"]
        )
        self.pitch_emb = nn.Conv1d(
            1, config["encoder_projection_units"], kernel_size=9, padding=4
        )
        self.energy_emb = nn.Conv1d(
            1, config["encoder_projection_units"], kernel_size=9, padding=4
        )

    def forward(
        self,
        inputs_text_embedding,
        inputs_emo_embedding,
        inputs_spk_embedding,  # [1,20,192]
        masks=None,
        output_masks=None,
        duration_targets=None,
        pitch_targets=None,
        energy_targets=None,
    ):
        batch_size = inputs_text_embedding.size(0)
        variance_predictor_inputs = torch.cat(
            [inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
        )

        pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
        energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)

        if pitch_targets is not None:
            pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
                1, 2
            )
        else:
            pitch_embeddings = self.pitch_emb(
                pitch_predictions.unsqueeze(1)
            ).transpose(1, 2)

        if energy_targets is not None:
            energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
                1, 2
            )
        else:
            energy_embeddings = self.energy_emb(
                energy_predictions.unsqueeze(1)
            ).transpose(1, 2)

        inputs_text_embedding_aug = (
            inputs_text_embedding + pitch_embeddings + energy_embeddings
        )
        duration_predictor_cond = torch.cat(
            [inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
            dim=-1,
        )

        if duration_targets is not None:
            duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
                inputs_text_embedding.device
            )
            duration_predictor_input = torch.cat(
                [duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
            )
            duration_predictor_input = torch.log(duration_predictor_input + 1)
            log_duration_predictions, _ = self.duration_predictor(
                duration_predictor_input.unsqueeze(-1),
                duration_predictor_cond,
                masks=masks,
            )
            duration_predictions = torch.exp(log_duration_predictions) - 1
        else:
            log_duration_predictions = self.duration_predictor.infer(
                duration_predictor_cond, masks=masks
            )
            duration_predictions = torch.exp(log_duration_predictions) - 1

        if duration_targets is not None:
            LR_text_outputs, LR_length_rounded = self.length_regulator(
                inputs_text_embedding_aug, duration_targets, masks=output_masks
            )
            LR_position_embeddings = self.dur_position_encoder(
                duration_targets, masks=output_masks
            )
            LR_emo_outputs, _ = self.length_regulator(
                inputs_emo_embedding, duration_targets, masks=output_masks
            )
            LR_spk_outputs, _ = self.length_regulator(
                inputs_spk_embedding, duration_targets, masks=output_masks
            )
        else:
            LR_text_outputs, LR_length_rounded = self.length_regulator(
                inputs_text_embedding_aug, duration_predictions, masks=output_masks
            )
            LR_position_embeddings = self.dur_position_encoder(
                duration_predictions, masks=output_masks
            )
            LR_emo_outputs, _ = self.length_regulator(
                inputs_emo_embedding, duration_predictions, masks=output_masks
            )
            LR_spk_outputs, _ = self.length_regulator(
                inputs_spk_embedding, duration_predictions, masks=output_masks
            )

        LR_text_outputs = LR_text_outputs + LR_position_embeddings

        return (
            LR_text_outputs,
            LR_emo_outputs,
            LR_spk_outputs,  # [1,153,192]
            LR_length_rounded,
            log_duration_predictions,
            pitch_predictions,
            energy_predictions,
        )
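
# A minimal standalone sketch (toy values, not part of the original file) of
# the log-domain duration codec used by VarianceAdaptor above: the duration
# predictor works on log(d + 1), and predictions are decoded with exp(.) - 1,
# so frame counts round-trip exactly up to float precision.
import torch

d = torch.tensor([0.0, 1.0, 4.0, 12.0])  # per-token frame durations (toy)
encoded = torch.log(d + 1)                # what the duration predictor sees
decoded = torch.exp(encoded) - 1          # what inference recovers
print(torch.allclose(d, decoded))         # True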
class VarianceAdaptor2(nn.Module):
    def __init__(self, config):
        super(VarianceAdaptor2, self).__init__()

        input_dim = (
            config["encoder_projection_units"]
            + config["emotion_units"]
            + config["speaker_units"]
        )
        filter_size = config["predictor_filter_size"]
        fsmn_num_layers = config["predictor_fsmn_num_layers"]
        num_memory_units = config["predictor_num_memory_units"]
        ffn_inner_dim = config["predictor_ffn_inner_dim"]
        dropout = config["predictor_dropout"]
        shift = config["predictor_shift"]
        lstm_units = config["predictor_lstm_units"]
        dur_pred_prenet_units = config["dur_pred_prenet_units"]
        dur_pred_lstm_units = config["dur_pred_lstm_units"]

        self.pitch_predictor = VarFsmnRnnNARPredictor(
            input_dim,
            filter_size,
            fsmn_num_layers,
            num_memory_units,
            ffn_inner_dim,
            dropout,
            shift,
            lstm_units,
        )
        self.energy_predictor = VarFsmnRnnNARPredictor(
            input_dim,
            filter_size,
            fsmn_num_layers,
            num_memory_units,
            ffn_inner_dim,
            dropout,
            shift,
            lstm_units,
        )
        self.duration_predictor = VarRnnARPredictor(
            input_dim, dur_pred_prenet_units, dur_pred_lstm_units
        )
        self.length_regulator = LengthRegulator(config["outputs_per_step"])
        self.dur_position_encoder = DurSinusoidalPositionEncoder(
            config["encoder_projection_units"], config["outputs_per_step"]
        )
        self.pitch_emb = nn.Conv1d(
            1, config["encoder_projection_units"], kernel_size=9, padding=4
        )
        self.energy_emb = nn.Conv1d(
            1, config["encoder_projection_units"], kernel_size=9, padding=4
        )

    def forward(
        self,
        inputs_text_embedding,
        inputs_emo_embedding,
        inputs_spk_embedding,  # [1,20,192]
        scale=1.0,
        masks=None,
        output_masks=None,
        duration_targets=None,
        pitch_targets=None,
        energy_targets=None,
    ):
        batch_size = inputs_text_embedding.size(0)
        variance_predictor_inputs = torch.cat(
            [inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding], dim=-1
        )

        pitch_predictions = self.pitch_predictor(variance_predictor_inputs, masks)
        energy_predictions = self.energy_predictor(variance_predictor_inputs, masks)

        if pitch_targets is not None:
            pitch_embeddings = self.pitch_emb(pitch_targets.unsqueeze(1)).transpose(
                1, 2
            )
        else:
            pitch_embeddings = self.pitch_emb(
                pitch_predictions.unsqueeze(1)
            ).transpose(1, 2)

        if energy_targets is not None:
            energy_embeddings = self.energy_emb(energy_targets.unsqueeze(1)).transpose(
                1, 2
            )
        else:
            energy_embeddings = self.energy_emb(
                energy_predictions.unsqueeze(1)
            ).transpose(1, 2)

        inputs_text_embedding_aug = (
            inputs_text_embedding + pitch_embeddings + energy_embeddings
        )
        duration_predictor_cond = torch.cat(
            [inputs_text_embedding_aug, inputs_spk_embedding, inputs_emo_embedding],
            dim=-1,
        )

        if duration_targets is not None:
            duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
                inputs_text_embedding.device
            )
            duration_predictor_input = torch.cat(
                [duration_predictor_go_frame, duration_targets[:, :-1].float()], dim=-1
            )
            duration_predictor_input = torch.log(duration_predictor_input + 1)
            log_duration_predictions, _ = self.duration_predictor(
                duration_predictor_input.unsqueeze(-1),
                duration_predictor_cond,
                masks=masks,
            )
            duration_predictions = torch.exp(log_duration_predictions) - 1
        else:
            log_duration_predictions = self.duration_predictor.infer(
                duration_predictor_cond, masks=masks
            )
            duration_predictions = torch.exp(log_duration_predictions) - 1

        if duration_targets is not None:
            LR_text_outputs, LR_length_rounded = self.length_regulator(
                inputs_text_embedding_aug,
                duration_targets * scale,
                masks=output_masks,  # *scale
            )
            LR_position_embeddings = self.dur_position_encoder(
                duration_targets, masks=output_masks
            )
            LR_emo_outputs, _ = self.length_regulator(
                inputs_emo_embedding,
                duration_targets * scale,
                masks=output_masks,  # *scale
            )
            LR_spk_outputs, _ = self.length_regulator(
                inputs_spk_embedding,
                duration_targets * scale,
                masks=output_masks,  # *scale
            )
        else:
            LR_text_outputs, LR_length_rounded = self.length_regulator(
                inputs_text_embedding_aug,
                duration_predictions * scale,
                masks=output_masks,  # *scale
            )
            LR_position_embeddings = self.dur_position_encoder(
                duration_predictions * scale,
                masks=output_masks,  # *target_rate
            )
            LR_emo_outputs, _ = self.length_regulator(
                inputs_emo_embedding,
                duration_predictions * scale,
                masks=output_masks,  # *scale
            )
            LR_spk_outputs, _ = self.length_regulator(
                inputs_spk_embedding,
                duration_predictions * scale,
                masks=output_masks,  # *scale
            )

        LR_text_outputs = LR_text_outputs + LR_position_embeddings

        return (
            LR_text_outputs,
            LR_emo_outputs,
            LR_spk_outputs,  # [1,153,192]
            LR_length_rounded,
            log_duration_predictions,
            pitch_predictions,
            energy_predictions,
        )


class MelPNCADecoder(nn.Module):
    def __init__(self, config):
        super(MelPNCADecoder, self).__init__()

        prenet_units = config["decoder_prenet_units"]
        nb_layers = config["decoder_num_layers"]
        nb_heads = config["decoder_num_heads"]
        d_model = config["decoder_num_units"]
        d_head = d_model // nb_heads
        d_inner = config["decoder_ffn_inner_dim"]
        dropout = config["decoder_dropout"]
        dropout_attn = config["decoder_attention_dropout"]
        dropout_relu = config["decoder_relu_dropout"]
        outputs_per_step = config["outputs_per_step"]
        d_mem = (
            config["encoder_projection_units"] * outputs_per_step
            + config["emotion_units"]
            + config["speaker_units"]
        )
        d_mel = config["num_mels"]

        self.d_mel = d_mel
        self.r = outputs_per_step
        self.nb_layers = nb_layers

        self.mel_dec = HybridAttentionDecoder(
            d_mel,
            prenet_units,
            nb_layers,
            d_model,
            d_mem,
            nb_heads,
            d_head,
            d_inner,
            dropout,
            dropout_attn,
            dropout_relu,
            d_mel * outputs_per_step,
        )

    def forward(
        self,
        memory,
        x_band_width,
        h_band_width,
        target=None,
        masks=None,
        return_attns=False,
    ):
        batch_size = memory.size(0)
        go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
        if target is not None:
            self.mel_dec.reset_state()
            input = target[:, self.r - 1 :: self.r, :]
            input = torch.cat([go_frame, input], dim=1)[:, :-1, :]
            dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec(
                input,
                memory,
                x_band_width,
                h_band_width,
                masks=masks,
                return_attns=return_attns,
            )
        else:
            dec_output = []
            dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)]
            dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)]
            self.mel_dec.reset_state()
            input = go_frame
            for step in range(memory.size(1)):
                (
                    dec_output_step,
                    dec_pnca_attn_x_step,
                    dec_pnca_attn_h_step,
                ) = self.mel_dec.infer(
                    step,
                    input,
                    memory,
                    x_band_width,
                    h_band_width,
                    masks=masks,
                    return_attns=return_attns,
                )
                input = dec_output_step[:, :, -self.d_mel :]
                dec_output.append(dec_output_step)
                for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate(
                    zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)
                ):
                    left = memory.size(1) - pnca_x_attn.size(-1)
                    if left > 0:
                        padding = torch.zeros((pnca_x_attn.size(0), 1, left)).to(
                            pnca_x_attn
                        )
                        pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1)
                    dec_pnca_attn_x_list[layer_id].append(pnca_x_attn)
                    dec_pnca_attn_h_list[layer_id].append(pnca_h_attn)
            dec_output = torch.cat(dec_output, dim=1)
            if return_attns:
                for layer_id in range(self.nb_layers):
                    dec_pnca_attn_x_list[layer_id] = torch.cat(
                        dec_pnca_attn_x_list[layer_id], dim=1
                    )
                    dec_pnca_attn_h_list[layer_id] = torch.cat(
                        dec_pnca_attn_h_list[layer_id], dim=1
                    )

        if return_attns:
            return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
        else:
            return dec_output
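
# A minimal standalone sketch (toy sizes) of the outputs_per_step (r) target
# subsampling in MelPNCADecoder.forward above: during teacher forcing the
# decoder is fed every r-th target frame, shifted right by one go-frame.
import torch

r = 3
target = torch.arange(12.0).reshape(1, 12, 1)  # 12 frames, 1 mel bin (toy)
go_frame = torch.zeros((1, 1, 1))
inp = target[:, r - 1 :: r, :]                 # frames 2, 5, 8, 11
inp = torch.cat([go_frame, inp], dim=1)[:, :-1, :]
print(inp.squeeze(-1))                         # tensor([[0., 2., 5., 8.]])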
class PostNet(nn.Module):
    def __init__(self, config):
        super(PostNet, self).__init__()

        self.filter_size = config["postnet_filter_size"]
        self.fsmn_num_layers = config["postnet_fsmn_num_layers"]
        self.num_memory_units = config["postnet_num_memory_units"]
        self.ffn_inner_dim = config["postnet_ffn_inner_dim"]
        self.dropout = config["postnet_dropout"]
        self.shift = config["postnet_shift"]
        self.lstm_units = config["postnet_lstm_units"]
        self.num_mels = config["num_mels"]

        self.fsmn = FsmnEncoderV2(
            self.filter_size,
            self.fsmn_num_layers,
            self.num_mels,
            self.num_memory_units,
            self.ffn_inner_dim,
            self.dropout,
            self.shift,
        )
        self.lstm = nn.LSTM(
            self.num_memory_units, self.lstm_units, num_layers=1, batch_first=True
        )
        self.fc = nn.Linear(self.lstm_units, self.num_mels)

    def forward(self, x, mask=None):
        postnet_fsmn_output = self.fsmn(x, mask)
        # The input could also be a packed variable-length sequence; we omit
        # that here for simplicity, since the mask and the uni-directional
        # LSTM already handle padding.
        postnet_lstm_output, _ = self.lstm(postnet_fsmn_output)
        mel_residual_output = self.fc(postnet_lstm_output)
        return mel_residual_output


class FP_Predictor(nn.Module):
    def __init__(self, config):
        super(FP_Predictor, self).__init__()
        self.w_1 = nn.Conv1d(
            config["encoder_projection_units"],
            config["embedding_dim"] // 2,
            kernel_size=3,
            padding=1,
        )
        self.w_2 = nn.Conv1d(
            config["embedding_dim"] // 2,
            config["encoder_projection_units"],
            kernel_size=1,
            padding=0,
        )
        self.layer_norm1 = nn.LayerNorm(config["embedding_dim"] // 2, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(config["encoder_projection_units"], eps=1e-6)
        self.dropout_inner = nn.Dropout(0.1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(config["encoder_projection_units"], 4)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = F.relu(self.w_1(x))
        x = x.transpose(1, 2)
        x = self.dropout_inner(self.layer_norm1(x))
        x = x.transpose(1, 2)
        x = F.relu(self.w_2(x))
        x = x.transpose(1, 2)
        x = self.dropout(self.layer_norm2(x))
        output = F.softmax(self.fc(x), dim=2)
        return output
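
The banded PNCA attention built by HybridAttentionDecoder.get_pnca_attn_mask is easiest to see on a toy example. A minimal sketch of the x-branch (history) band with made-up sizes and no padding mask; False means "may attend":

import torch

max_len, x_band_width = 5, 2
range_ = torch.arange(max_len)
x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
x_end = (range_ + 1)[None, None, :]
pnca_x_attn_mask = ~(
    (x_start <= range_[None, :, None]) & (x_end > range_[None, :, None])
).transpose(1, 2)
print(pnca_x_attn_mask[0].int())
# tensor([[0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1],
#         [1, 0, 0, 0, 1],
#         [1, 1, 0, 0, 0]], dtype=torch.int32)
# Row t may attend keys in [max(0, t - x_band_width), t]: a causal band.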
kantts/models/sambert/positions.py (new file, mode 100644)
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class SinusoidalPositionEncoder(nn.Module):
    def __init__(self, max_len, depth):
        super(SinusoidalPositionEncoder, self).__init__()
        self.max_len = max_len
        self.depth = depth
        self.position_enc = nn.Parameter(
            self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
            requires_grad=False,
        )

    def forward(self, input):
        bz_in, len_in, _ = input.size()
        if len_in > self.max_len:
            self.max_len = len_in
            self.position_enc.data = (
                self.get_sinusoid_encoding_table(self.max_len, self.depth)
                .unsqueeze(0)
                .to(input.device)
            )
        output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
        return output

    @staticmethod
    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
        """Sinusoid position encoding table"""

        def cal_angle(position, hid_idx):
            return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))

        def get_posi_angle_vec(position):
            return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]

        scaled_time_table = np.array(
            [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]
        )

        sinusoid_table = np.zeros((n_position, d_hid))
        sinusoid_table[:, : d_hid // 2] = np.sin(scaled_time_table)
        sinusoid_table[:, d_hid // 2 :] = np.cos(scaled_time_table)

        if padding_idx is not None:
            # zero vector for padding dimension
            sinusoid_table[padding_idx] = 0.0

        return torch.FloatTensor(sinusoid_table)


class DurSinusoidalPositionEncoder(nn.Module):
    def __init__(self, depth, outputs_per_step):
        super(DurSinusoidalPositionEncoder, self).__init__()
        self.depth = depth
        self.outputs_per_step = outputs_per_step
        inv_timescales = [
            np.power(10000, 2 * (hid_idx // 2) / depth) for hid_idx in range(depth)
        ]
        self.inv_timescales = nn.Parameter(
            torch.FloatTensor(inv_timescales), requires_grad=False
        )

    def forward(self, durations, masks=None):
        reps = (durations + 0.5).long()
        output_lens = reps.sum(dim=1)
        max_len = output_lens.max()
        reps_cumsum = torch.cumsum(
            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1
        )[:, None, :]
        range_ = torch.arange(max_len).to(durations.device)[None, :, None]
        mult = (reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)
        mult = mult.float()
        offsets = torch.matmul(
            mult, reps_cumsum[:, 0, :-1].unsqueeze(-1)
        ).squeeze(-1)
        dur_pos = range_[:, :, 0] - offsets + 1

        if masks is not None:
            assert masks.size(1) == dur_pos.size(1)
            dur_pos = dur_pos.masked_fill(masks, 0.0)

        seq_len = dur_pos.size(1)
        padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
        if padding < self.outputs_per_step:
            dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)

        position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, None, :]
        position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, 0::2])
        position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, 1::2])
        return position_embedding
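
A quick property check for get_sinusoid_encoding_table above: the first d_hid/2 columns are sines and the rest cosines of the same angles, so paired columns satisfy sin^2 + cos^2 = 1. A sketch with toy sizes (n_position=4, d_hid=8, not model settings):

import numpy as np
import torch

def get_sinusoid_encoding_table(n_position, d_hid):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]

    scaled = np.array([get_posi_angle_vec(p + 1) for p in range(n_position)])
    table = np.zeros((n_position, d_hid))
    table[:, : d_hid // 2] = np.sin(scaled)  # first half: sin
    table[:, d_hid // 2 :] = np.cos(scaled)  # second half: cos
    return torch.FloatTensor(table)

tab = get_sinusoid_encoding_table(4, 8)
print(tab.shape)  # torch.Size([4, 8])
print(torch.allclose(tab[:, :4] ** 2 + tab[:, 4:] ** 2, torch.ones(4, 4)))  # True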
kantts/models/utils.py (new file, mode 100644)
import torch
from distutils.version import LooseVersion

is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion("1.7")


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_mask_from_lengths(lengths, max_len=None):
    batch_size = lengths.shape[0]
    if max_len is None:
        max_len = torch.max(lengths).item()

    ids = (
        torch.arange(0, max_len)
        .unsqueeze(0)
        .expand(batch_size, -1)
        .to(lengths.device)
    )
    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)

    return mask
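
The padding-mask convention of get_mask_from_lengths (True marks padded positions) can be checked in isolation. A minimal sketch with made-up lengths:

import torch
from kantts.models.utils import get_mask_from_lengths  # assumes kantts is on PYTHONPATH

lengths = torch.tensor([3, 5, 2])      # per-utterance frame counts (toy)
mask = get_mask_from_lengths(lengths)  # True = padding position
print(mask)
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False],
#         [False, False,  True,  True,  True]])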
kantts/preprocess/__init__.py (new file, mode 100644, empty)

kantts/preprocess/__pycache__/__init__.cpython-38.pyc (new file, mode 100644; binary file added)

kantts/preprocess/__pycache__/fp_processor.cpython-38.pyc (new file, mode 100644; binary file added)

kantts/preprocess/audio_processor/__init__.py (new file, mode 100644, empty)

kantts/preprocess/audio_processor/__pycache__/__init__.cpython-38.pyc (new file, mode 100644; binary file added)

kantts/preprocess/audio_processor/__pycache__/audio_processor.cpython-38.pyc (new file, mode 100644; binary file added)
kantts/preprocess/audio_processor/audio_processor.py (new file, mode 100644)
import os
import numpy as np
from glob import glob
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import argparse
import yaml
import logging

from .core.utils import (
    volume_normalize,
    get_pitch,
    get_energy,
    align_length,
    compute_mean,
    compute_std,
    f0_norm_mean_std,
    norm_mean_std,
    parse_interval_file,
    average_by_duration,
    encode_16bits,
)
from .core.dsp import (
    melspectrogram,
    load_wav,
    trim_silence,
    trim_silence_with_interval,
    save_wav,
)

logging.basicConfig(
    format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=logging.DEBUG,
)

default_audio_config = {
    # Preprocess
    "wav_normalize": True,
    "trim_silence": True,
    "trim_silence_threshold_db": 60,
    "preemphasize": False,
    # Feature extraction
    "sampling_rate": 24000,
    "hop_length": 240,
    "win_length": 1024,
    "n_mels": 80,
    "n_fft": 1024,
    "fmin": 50.0,
    "fmax": 7600.0,
    "min_level_db": -100,
    "ref_level_db": 20,
    "phone_level_feature": True,
    "num_workers": 16,
    # Normalization
    "norm_type": "mean_std",  # 'mean_std', 'global norm'
    "max_norm": 1.0,
    "symmetric": False,
}
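
# A hedged usage sketch (not part of the original file): AudioProcessor
# copies every config key onto itself via setattr, so overriding a field is
# just a dict update. The 16 kHz / 200-sample hop below are example values,
# not recommended settings.
custom_config = dict(default_audio_config)
custom_config.update({"sampling_rate": 16000, "hop_length": 200})
# ap = AudioProcessor(custom_config)  # each key/value is logged at init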
class
AudioProcessor
:
def
__init__
(
self
,
config
=
None
):
# TODO: Add more audio processing methods.
if
not
isinstance
(
config
,
dict
):
logging
.
warning
(
"[AudioProcessor] config is not a dict, fall into default config."
)
self
.
config
=
default_audio_config
else
:
self
.
config
=
config
for
key
in
self
.
config
:
setattr
(
self
,
key
,
self
.
config
[
key
])
self
.
min_wav_length
=
int
(
self
.
config
[
"sampling_rate"
]
*
0.5
)
self
.
badcase_list
=
[]
self
.
pcm_dict
=
{}
self
.
mel_dict
=
{}
self
.
f0_dict
=
{}
self
.
uv_dict
=
{}
self
.
nccf_dict
=
{}
self
.
f0uv_dict
=
{}
self
.
energy_dict
=
{}
self
.
dur_dict
=
{}
logging
.
info
(
"[AudioProcessor] Initialize AudioProcessor."
)
logging
.
info
(
"[AudioProcessor] config params:"
)
for
key
in
self
.
config
:
logging
.
info
(
"[AudioProcessor] %s: %s"
,
key
,
self
.
config
[
key
])
def
calibrate_SyllableDuration
(
self
,
raw_dur_dir
,
raw_metafile
,
out_cali_duration_dir
):
with
open
(
raw_metafile
,
"r"
)
as
f
:
lines
=
f
.
readlines
()
output_dur_dir
=
out_cali_duration_dir
os
.
makedirs
(
output_dur_dir
,
exist_ok
=
True
)
for
line
in
lines
:
line
=
line
.
strip
()
index
,
symbols
=
line
.
split
(
"
\t
"
)
symbols
=
[
symbol
.
strip
(
"{"
).
strip
(
"}"
).
split
(
"$"
)[
0
]
for
symbol
in
symbols
.
strip
().
split
(
" "
)
]
dur_file
=
os
.
path
.
join
(
raw_dur_dir
,
index
+
".npy"
)
phone_file
=
os
.
path
.
join
(
raw_dur_dir
,
index
+
".phone"
)
if
not
os
.
path
.
exists
(
dur_file
)
or
not
os
.
path
.
exists
(
phone_file
):
logging
.
warning
(
"[AudioProcessor] dur file or phone file not exists: %s"
,
index
)
continue
with
open
(
phone_file
,
"r"
)
as
f
:
phones
=
f
.
readlines
()
dur
=
np
.
load
(
dur_file
)
cali_duration
=
[]
dur_idx
=
0
syll_idx
=
0
while
dur_idx
<
len
(
dur
)
and
syll_idx
<
len
(
symbols
):
if
phones
[
dur_idx
].
strip
()
==
"sil"
:
dur_idx
+=
1
continue
if
phones
[
dur_idx
].
strip
()
==
"sp"
and
symbols
[
syll_idx
][
0
]
!=
"#"
:
dur_idx
+=
1
continue
if
symbols
[
syll_idx
]
in
[
"ga"
,
"go"
,
"ge"
]:
cali_duration
.
append
(
0
)
syll_idx
+=
1
# print("NONE", symbols[syll_idx], 0)
continue
if
symbols
[
syll_idx
][
0
]
==
"#"
:
if
phones
[
dur_idx
].
strip
()
!=
"sp"
:
cali_duration
.
append
(
0
)
# print("NONE", symbols[syll_idx], 0)
syll_idx
+=
1
continue
else
:
cali_duration
.
append
(
dur
[
dur_idx
])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx
+=
1
syll_idx
+=
1
continue
# A corresponding phone is found
cali_duration
.
append
(
dur
[
dur_idx
])
# print(phones[dur_idx].strip(), symbols[syll_idx], dur[dur_idx])
dur_idx
+=
1
syll_idx
+=
1
# Add #4 phone duration
cali_duration
.
append
(
0
)
if
len
(
cali_duration
)
!=
len
(
symbols
):
logging
.
error
(
"[Duration Calibrating] Syllable duration {}
\
is not equal to the number of symbols {}, index: {}"
.
format
(
len
(
cali_duration
),
len
(
symbols
),
index
)
)
continue
# Align with mel frames
durs
=
np
.
array
(
cali_duration
)
if
len
(
self
.
mel_dict
)
>
0
:
pair_mel
=
self
.
mel_dict
.
get
(
index
,
None
)
if
pair_mel
is
None
:
logging
.
warning
(
"[AudioProcessor] Interval file %s has no corresponding mel"
,
index
,
)
continue
mel_frames
=
pair_mel
.
shape
[
0
]
dur_frames
=
np
.
sum
(
durs
)
if
np
.
sum
(
durs
)
>
mel_frames
:
durs
[
-
2
]
-=
dur_frames
-
mel_frames
elif
np
.
sum
(
durs
)
<
mel_frames
:
durs
[
-
2
]
+=
mel_frames
-
np
.
sum
(
durs
)
if
durs
[
-
2
]
<
0
:
logging
.
error
(
"[AudioProcessor] Duration calibrating failed for %s, mismatch frames %s"
,
index
,
durs
[
-
2
],
)
self
.
badcase_list
.
append
(
index
)
continue
self
.
dur_dict
[
index
]
=
durs
np
.
save
(
os
.
path
.
join
(
output_dur_dir
,
index
+
".npy"
),
self
.
dur_dict
[
index
])
def
amp_normalize
(
self
,
src_wav_dir
,
out_wav_dir
):
if
self
.
wav_normalize
:
logging
.
info
(
"[AudioProcessor] Amplitude normalization started"
)
os
.
makedirs
(
out_wav_dir
,
exist_ok
=
True
)
res
=
volume_normalize
(
src_wav_dir
,
out_wav_dir
)
logging
.
info
(
"[AudioProcessor] Amplitude normalization finished"
)
return
res
else
:
logging
.
info
(
"[AudioProcessor] No amplitude normalization"
)
os
.
symlink
(
src_wav_dir
,
out_wav_dir
,
target_is_directory
=
True
)
return
True
def
get_pcm_dict
(
self
,
src_wav_dir
):
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
if
len
(
self
.
pcm_dict
)
>
0
:
return
self
.
pcm_dict
logging
.
info
(
"[AudioProcessor] Start to load pcm from %s"
,
src_wav_dir
)
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_path
in
wav_list
:
future
=
executor
.
submit
(
load_wav
,
wav_path
,
self
.
sampling_rate
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
wav_name
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
wav_path
))[
0
]
futures
.
append
((
future
,
wav_name
))
for
future
,
wav_name
in
futures
:
pcm
=
future
.
result
()
if
len
(
pcm
)
<
self
.
min_wav_length
:
logging
.
warning
(
"[AudioProcessor] %s is too short, skip"
,
wav_name
)
self
.
badcase_list
.
append
(
wav_name
)
continue
self
.
pcm_dict
[
wav_name
]
=
pcm
return
self
.
pcm_dict
def
trim_silence_wav
(
self
,
src_wav_dir
,
out_wav_dir
=
None
):
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
logging
.
info
(
"[AudioProcessor] Trim silence started"
)
if
out_wav_dir
is
None
:
out_wav_dir
=
src_wav_dir
else
:
os
.
makedirs
(
out_wav_dir
,
exist_ok
=
True
)
pcm_dict
=
self
.
get_pcm_dict
(
src_wav_dir
)
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_basename
,
pcm_data
in
pcm_dict
.
items
():
future
=
executor
.
submit
(
trim_silence
,
pcm_data
,
self
.
trim_silence_threshold_db
,
self
.
hop_length
,
self
.
win_length
,
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
# TODO: multi-processing
for
future
,
wav_basename
in
tqdm
(
futures
):
pcm
=
future
.
result
()
if
len
(
pcm
)
<
self
.
min_wav_length
:
logging
.
warning
(
"[AudioProcessor] %s is too short, skip"
,
wav_basename
)
self
.
badcase_list
.
append
(
wav_basename
)
self
.
pcm_dict
.
pop
(
wav_basename
)
continue
self
.
pcm_dict
[
wav_basename
]
=
pcm
save_wav
(
self
.
pcm_dict
[
wav_basename
],
os
.
path
.
join
(
out_wav_dir
,
wav_basename
+
".wav"
),
self
.
sampling_rate
,
)
logging
.
info
(
"[AudioProcessor] Trim silence finished"
)
return
True
def
trim_silence_wav_with_interval
(
self
,
src_wav_dir
,
dur_dir
,
out_wav_dir
=
None
):
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
logging
.
info
(
"[AudioProcessor] Trim silence with interval started"
)
if
out_wav_dir
is
None
:
out_wav_dir
=
src_wav_dir
else
:
os
.
makedirs
(
out_wav_dir
,
exist_ok
=
True
)
pcm_dict
=
self
.
get_pcm_dict
(
src_wav_dir
)
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_basename
,
pcm_data
in
pcm_dict
.
items
():
future
=
executor
.
submit
(
trim_silence_with_interval
,
pcm_data
,
self
.
dur_dict
.
get
(
wav_basename
,
None
),
self
.
hop_length
,
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
# TODO: multi-processing
for
future
,
wav_basename
in
tqdm
(
futures
):
trimed_pcm
=
future
.
result
()
if
trimed_pcm
is
None
:
continue
if
len
(
trimed_pcm
)
<
self
.
min_wav_length
:
logging
.
warning
(
"[AudioProcessor] %s is too short, skip"
,
wav_basename
)
self
.
badcase_list
.
append
(
wav_basename
)
self
.
pcm_dict
.
pop
(
wav_basename
)
continue
self
.
pcm_dict
[
wav_basename
]
=
trimed_pcm
save_wav
(
self
.
pcm_dict
[
wav_basename
],
os
.
path
.
join
(
out_wav_dir
,
wav_basename
+
".wav"
),
self
.
sampling_rate
,
)
logging
.
info
(
"[AudioProcessor] Trim silence finished"
)
return
True
def
mel_extract
(
self
,
src_wav_dir
,
out_feature_dir
):
os
.
makedirs
(
out_feature_dir
,
exist_ok
=
True
)
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
pcm_dict
=
self
.
get_pcm_dict
(
src_wav_dir
)
logging
.
info
(
"[AudioProcessor] Melspec extraction started"
)
# Get global normed mel spec
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_basename
,
pcm_data
in
pcm_dict
.
items
():
future
=
executor
.
submit
(
melspectrogram
,
pcm_data
,
self
.
sampling_rate
,
self
.
n_fft
,
self
.
hop_length
,
self
.
win_length
,
self
.
n_mels
,
self
.
max_norm
,
self
.
min_level_db
,
self
.
ref_level_db
,
self
.
fmin
,
self
.
fmax
,
self
.
symmetric
,
self
.
preemphasize
,
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Melspec extraction failed for %s"
,
wav_basename
,
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
melspec
=
result
self
.
mel_dict
[
wav_basename
]
=
melspec
logging
.
info
(
"[AudioProcessor] Melspec extraction finished"
)
# FIXME: is this step necessary?
# Do mean std norm on global-normed melspec
logging
.
info
(
"Melspec statistic proceeding..."
)
mel_mean
=
compute_mean
(
list
(
self
.
mel_dict
.
values
()),
dims
=
self
.
n_mels
)
mel_std
=
compute_std
(
list
(
self
.
mel_dict
.
values
()),
mel_mean
,
dims
=
self
.
n_mels
)
logging
.
info
(
"Melspec statistic done"
)
np
.
savetxt
(
os
.
path
.
join
(
out_feature_dir
,
"mel_mean.txt"
),
mel_mean
,
fmt
=
"%.6f"
)
np
.
savetxt
(
os
.
path
.
join
(
out_feature_dir
,
"mel_std.txt"
),
mel_std
,
fmt
=
"%.6f"
)
logging
.
info
(
"[AudioProcessor] melspec mean and std saved to:
\n
{},
\n
{}"
.
format
(
os
.
path
.
join
(
out_feature_dir
,
"mel_mean.txt"
),
os
.
path
.
join
(
out_feature_dir
,
"mel_std.txt"
),
)
)
logging
.
info
(
"[AudioProcessor] Melspec mean std norm is proceeding..."
)
for
wav_basename
in
self
.
mel_dict
:
melspec
=
self
.
mel_dict
[
wav_basename
]
norm_melspec
=
norm_mean_std
(
melspec
,
mel_mean
,
mel_std
)
np
.
save
(
os
.
path
.
join
(
out_feature_dir
,
wav_basename
+
".npy"
),
norm_melspec
)
logging
.
info
(
"[AudioProcessor] Melspec normalization finished"
)
logging
.
info
(
"[AudioProcessor] Normed Melspec saved to %s"
,
out_feature_dir
)
return
True
# TODO: some dataset may have no interval label
def
duration_generate
(
self
,
src_interval_dir
,
out_feature_dir
):
os
.
makedirs
(
out_feature_dir
,
exist_ok
=
True
)
interval_list
=
glob
(
os
.
path
.
join
(
src_interval_dir
,
"*.interval"
))
logging
.
info
(
"[AudioProcessor] Duration generation started"
)
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
interval_list
)
)
as
progress
:
futures
=
[]
for
interval_file_path
in
interval_list
:
future
=
executor
.
submit
(
parse_interval_file
,
interval_file_path
,
self
.
sampling_rate
,
self
.
hop_length
,
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
(
(
future
,
os
.
path
.
splitext
(
os
.
path
.
basename
(
interval_file_path
))[
0
])
)
logging
.
info
(
"[AudioProcessor] Duration align with mel is proceeding..."
)
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Duration generate failed for %s"
,
wav_basename
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
durs
,
phone_list
=
result
# Algin length with melspec
if
len
(
self
.
mel_dict
)
>
0
:
pair_mel
=
self
.
mel_dict
.
get
(
wav_basename
,
None
)
if
pair_mel
is
None
:
logging
.
warning
(
"[AudioProcessor] Interval file %s has no corresponding mel"
,
wav_basename
,
)
continue
mel_frames
=
pair_mel
.
shape
[
0
]
dur_frames
=
np
.
sum
(
durs
)
if
np
.
sum
(
durs
)
>
mel_frames
:
durs
[
-
1
]
-=
dur_frames
-
mel_frames
elif
np
.
sum
(
durs
)
<
mel_frames
:
durs
[
-
1
]
+=
mel_frames
-
np
.
sum
(
durs
)
if
durs
[
-
1
]
<
0
:
logging
.
error
(
"[AudioProcessor] Duration align failed for %s, mismatch frames %s"
,
wav_basename
,
durs
[
-
1
],
)
self
.
badcase_list
.
append
(
wav_basename
)
continue
self
.
dur_dict
[
wav_basename
]
=
durs
np
.
save
(
os
.
path
.
join
(
out_feature_dir
,
wav_basename
+
".npy"
),
durs
)
with
open
(
os
.
path
.
join
(
out_feature_dir
,
wav_basename
+
".phone"
),
"w"
)
as
f
:
f
.
write
(
"
\n
"
.
join
(
phone_list
))
logging
.
info
(
"[AudioProcessor] Duration generate finished"
)
return
True
def
pitch_extract
(
self
,
src_wav_dir
,
out_f0_dir
,
out_frame_f0_dir
,
out_frame_uv_dir
):
os
.
makedirs
(
out_f0_dir
,
exist_ok
=
True
)
os
.
makedirs
(
out_frame_f0_dir
,
exist_ok
=
True
)
os
.
makedirs
(
out_frame_uv_dir
,
exist_ok
=
True
)
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
pcm_dict
=
self
.
get_pcm_dict
(
src_wav_dir
)
mel_dict
=
self
.
mel_dict
logging
.
info
(
"[AudioProcessor] Pitch extraction started"
)
# Get raw pitch
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_basename
,
pcm_data
in
pcm_dict
.
items
():
future
=
executor
.
submit
(
get_pitch
,
encode_16bits
(
pcm_data
),
self
.
sampling_rate
,
self
.
hop_length
,
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
logging
.
info
(
"[AudioProcessor] Pitch align with mel is proceeding..."
)
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Pitch extraction failed for %s"
,
wav_basename
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
f0
,
uv
,
f0uv
=
result
if
len
(
mel_dict
)
>
0
:
f0
=
align_length
(
f0
,
mel_dict
.
get
(
wav_basename
,
None
))
uv
=
align_length
(
uv
,
mel_dict
.
get
(
wav_basename
,
None
))
f0uv
=
align_length
(
f0uv
,
mel_dict
.
get
(
wav_basename
,
None
))
if
f0
is
None
or
uv
is
None
or
f0uv
is
None
:
logging
.
warning
(
"[AudioProcessor] Pitch length mismatch with mel in %s"
,
wav_basename
,
)
self
.
badcase_list
.
append
(
wav_basename
)
continue
self
.
f0_dict
[
wav_basename
]
=
f0
self
.
uv_dict
[
wav_basename
]
=
uv
self
.
f0uv_dict
[
wav_basename
]
=
f0uv
# Normalize f0
logging
.
info
(
"[AudioProcessor] Pitch normalization is proceeding..."
)
f0_mean
=
compute_mean
(
list
(
self
.
f0uv_dict
.
values
()),
dims
=
1
)
f0_std
=
compute_std
(
list
(
self
.
f0uv_dict
.
values
()),
f0_mean
,
dims
=
1
)
np
.
savetxt
(
os
.
path
.
join
(
out_f0_dir
,
"f0_mean.txt"
),
f0_mean
,
fmt
=
"%.6f"
)
np
.
savetxt
(
os
.
path
.
join
(
out_f0_dir
,
"f0_std.txt"
),
f0_std
,
fmt
=
"%.6f"
)
logging
.
info
(
"[AudioProcessor] f0 mean and std saved to:
\n
{},
\n
{}"
.
format
(
os
.
path
.
join
(
out_f0_dir
,
"f0_mean.txt"
),
os
.
path
.
join
(
out_f0_dir
,
"f0_std.txt"
),
)
)
logging
.
info
(
"[AudioProcessor] Pitch mean std norm is proceeding..."
)
for
wav_basename
in
self
.
f0uv_dict
:
f0
=
self
.
f0uv_dict
[
wav_basename
]
norm_f0
=
f0_norm_mean_std
(
f0
,
f0_mean
,
f0_std
)
self
.
f0uv_dict
[
wav_basename
]
=
norm_f0
for
wav_basename
in
self
.
f0_dict
:
f0
=
self
.
f0_dict
[
wav_basename
]
norm_f0
=
f0_norm_mean_std
(
f0
,
f0_mean
,
f0_std
)
self
.
f0_dict
[
wav_basename
]
=
norm_f0
# save frame f0 to a specific dir
for
wav_basename
in
self
.
f0_dict
:
np
.
save
(
os
.
path
.
join
(
out_frame_f0_dir
,
wav_basename
+
".npy"
),
self
.
f0_dict
[
wav_basename
].
reshape
(
-
1
),
)
for
wav_basename
in
self
.
uv_dict
:
np
.
save
(
os
.
path
.
join
(
out_frame_uv_dir
,
wav_basename
+
".npy"
),
self
.
uv_dict
[
wav_basename
].
reshape
(
-
1
),
)
# phone level average
# if there is no duration then save the frame-level f0
if
self
.
phone_level_feature
and
len
(
self
.
dur_dict
)
>
0
:
logging
.
info
(
"[AudioProcessor] Pitch turn to phone-level is proceeding..."
)
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
self
.
f0uv_dict
)
)
as
progress
:
futures
=
[]
for
wav_basename
in
self
.
f0uv_dict
:
future
=
executor
.
submit
(
average_by_duration
,
self
.
f0uv_dict
.
get
(
wav_basename
,
None
),
self
.
dur_dict
.
get
(
wav_basename
,
None
),
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Pitch extraction failed in phone level avg for: %s"
,
wav_basename
,
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
avg_f0
=
result
self
.
f0uv_dict
[
wav_basename
]
=
avg_f0
for
wav_basename
in
self
.
f0uv_dict
:
np
.
save
(
os
.
path
.
join
(
out_f0_dir
,
wav_basename
+
".npy"
),
self
.
f0uv_dict
[
wav_basename
].
reshape
(
-
1
),
)
logging
.
info
(
"[AudioProcessor] Pitch normalization finished"
)
logging
.
info
(
"[AudioProcessor] Normed f0 saved to %s"
,
out_f0_dir
)
logging
.
info
(
"[AudioProcessor] Pitch extraction finished"
)
return
True
def
energy_extract
(
self
,
src_wav_dir
,
out_energy_dir
,
out_frame_energy_dir
):
os
.
makedirs
(
out_energy_dir
,
exist_ok
=
True
)
os
.
makedirs
(
out_frame_energy_dir
,
exist_ok
=
True
)
wav_list
=
glob
(
os
.
path
.
join
(
src_wav_dir
,
"*.wav"
))
pcm_dict
=
self
.
get_pcm_dict
(
src_wav_dir
)
mel_dict
=
self
.
mel_dict
logging
.
info
(
"[AudioProcessor] Energy extraction started"
)
# Get raw energy
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
wav_list
)
)
as
progress
:
futures
=
[]
for
wav_basename
,
pcm_data
in
pcm_dict
.
items
():
future
=
executor
.
submit
(
get_energy
,
pcm_data
,
self
.
hop_length
,
self
.
win_length
,
self
.
n_fft
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Energy extraction failed for %s"
,
wav_basename
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
energy
=
result
if
len
(
mel_dict
)
>
0
:
energy
=
align_length
(
energy
,
mel_dict
.
get
(
wav_basename
,
None
))
if
energy
is
None
:
logging
.
warning
(
"[AudioProcessor] Energy length mismatch with mel in %s"
,
wav_basename
,
)
self
.
badcase_list
.
append
(
wav_basename
)
continue
self
.
energy_dict
[
wav_basename
]
=
energy
# Normalize energy
energy_mean
=
compute_mean
(
list
(
self
.
energy_dict
.
values
()),
dims
=
1
)
energy_std
=
compute_std
(
list
(
self
.
energy_dict
.
values
()),
energy_mean
,
dims
=
1
)
np
.
savetxt
(
os
.
path
.
join
(
out_energy_dir
,
"energy_mean.txt"
),
energy_mean
,
fmt
=
"%.6f"
)
np
.
savetxt
(
os
.
path
.
join
(
out_energy_dir
,
"energy_std.txt"
),
energy_std
,
fmt
=
"%.6f"
)
logging
.
info
(
"[AudioProcessor] energy mean and std saved to:
\n
{},
\n
{}"
.
format
(
os
.
path
.
join
(
out_energy_dir
,
"energy_mean.txt"
),
os
.
path
.
join
(
out_energy_dir
,
"energy_std.txt"
),
)
)
logging
.
info
(
"[AudioProcessor] Energy mean std norm is proceeding..."
)
for
wav_basename
in
self
.
energy_dict
:
energy
=
self
.
energy_dict
[
wav_basename
]
norm_energy
=
f0_norm_mean_std
(
energy
,
energy_mean
,
energy_std
)
self
.
energy_dict
[
wav_basename
]
=
norm_energy
# save frame energy to a specific dir
for
wav_basename
in
self
.
energy_dict
:
np
.
save
(
os
.
path
.
join
(
out_frame_energy_dir
,
wav_basename
+
".npy"
),
self
.
energy_dict
[
wav_basename
].
reshape
(
-
1
),
)
# phone level average
# if there is no duration then save the frame-level energy
if
self
.
phone_level_feature
and
len
(
self
.
dur_dict
)
>
0
:
with
ProcessPoolExecutor
(
max_workers
=
self
.
num_workers
)
as
executor
,
tqdm
(
total
=
len
(
self
.
energy_dict
)
)
as
progress
:
futures
=
[]
for
wav_basename
in
self
.
energy_dict
:
future
=
executor
.
submit
(
average_by_duration
,
self
.
energy_dict
.
get
(
wav_basename
,
None
),
self
.
dur_dict
.
get
(
wav_basename
,
None
),
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
((
future
,
wav_basename
))
# TODO: failed clear dict element
for
future
,
wav_basename
in
futures
:
result
=
future
.
result
()
if
result
is
None
:
logging
.
warning
(
"[AudioProcessor] Energy extraction failed in phone level avg for: %s"
,
wav_basename
,
)
self
.
badcase_list
.
append
(
wav_basename
)
else
:
avg_energy
=
result
self
.
energy_dict
[
wav_basename
]
=
avg_energy
for
wav_basename
in
self
.
energy_dict
:
np
.
save
(
os
.
path
.
join
(
out_energy_dir
,
wav_basename
+
".npy"
),
self
.
energy_dict
[
wav_basename
].
reshape
(
-
1
),
)
logging
.
info
(
"[AudioProcessor] Energy normalization finished"
)
logging
.
info
(
"[AudioProcessor] Normed Energy saved to %s"
,
out_energy_dir
)
logging
.
info
(
"[AudioProcessor] Energy extraction finished"
)
return
True
    def process(self, src_voice_dir, out_data_dir, aux_metafile=None):
        succeed = True

        raw_wav_dir = os.path.join(src_voice_dir, "wav")
        src_interval_dir = os.path.join(src_voice_dir, "interval")
        out_mel_dir = os.path.join(out_data_dir, "mel")
        out_f0_dir = os.path.join(out_data_dir, "f0")
        out_frame_f0_dir = os.path.join(out_data_dir, "frame_f0")
        out_frame_uv_dir = os.path.join(out_data_dir, "frame_uv")
        out_energy_dir = os.path.join(out_data_dir, "energy")
        out_frame_energy_dir = os.path.join(out_data_dir, "frame_energy")
        out_duration_dir = os.path.join(out_data_dir, "raw_duration")
        out_cali_duration_dir = os.path.join(out_data_dir, "duration")
        os.makedirs(out_data_dir, exist_ok=True)

        with_duration = os.path.exists(src_interval_dir)

        # TODO: a log file is needed to resume from a previous run
        train_wav_dir = os.path.join(out_data_dir, "wav")
        succeed = self.amp_normalize(raw_wav_dir, train_wav_dir)
        if not succeed:
            logging.error("[AudioProcessor] amp_normalize failed, exit")
            return False

        if with_duration:
            # Raw duration, non-trimmed
            succeed = self.duration_generate(src_interval_dir, out_duration_dir)
            if not succeed:
                logging.error("[AudioProcessor] duration_generate failed, exit")
                return False

        if self.trim_silence:
            if with_duration:
                succeed = self.trim_silence_wav_with_interval(
                    train_wav_dir, out_duration_dir
                )
                if not succeed:
                    logging.error(
                        "[AudioProcessor] trim_silence_wav_with_interval failed, exit"
                    )
                    return False
            else:
                succeed = self.trim_silence_wav(train_wav_dir)
                if not succeed:
                    logging.error("[AudioProcessor] trim_silence_wav failed, exit")
                    return False

        succeed = self.mel_extract(train_wav_dir, out_mel_dir)
        if not succeed:
            logging.error("[AudioProcessor] mel_extract failed, exit")
            return False

        if aux_metafile is not None and with_duration:
            self.calibrate_SyllableDuration(
                out_duration_dir, aux_metafile, out_cali_duration_dir
            )

        succeed = self.pitch_extract(
            train_wav_dir, out_f0_dir, out_frame_f0_dir, out_frame_uv_dir
        )
        if not succeed:
            logging.error("[AudioProcessor] pitch_extract failed, exit")
            return False

        succeed = self.energy_extract(
            train_wav_dir, out_energy_dir, out_frame_energy_dir
        )
        if not succeed:
            logging.error("[AudioProcessor] energy_extract failed, exit")
            return False

        # record the badcase list
        with open(os.path.join(out_data_dir, "badlist.txt"), "w") as f:
            f.write("\n".join(self.badcase_list))

        logging.info("[AudioProcessor] All features extracted successfully!")
        return succeed
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Audio Processor")
    parser.add_argument("--src_voice_dir", type=str, required=True)
    parser.add_argument("--out_data_dir", type=str, required=True)
    parser.add_argument("--config", type=str, default=None)
    args = parser.parse_args()

    if args.config is None:
        parser.error("--config is required to construct the AudioProcessor")
    with open(args.config, "r") as f:
        config = yaml.load(f, Loader=yaml.Loader)

    ap = AudioProcessor(config["audio_config"])
    ap.process(args.src_voice_dir, args.out_data_dir)
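
A minimal usage sketch for the processor above, assuming a YAML file with an `audio_config` section and a voice directory containing a `wav/` subfolder (plus an optional `interval/` folder when durations are available); all paths here are hypothetical:

import yaml
from kantts.preprocess.audio_processor.audio_processor import AudioProcessor

with open("configs/audio_config.yaml", "r") as f:  # hypothetical config path
    config = yaml.load(f, Loader=yaml.Loader)

ap = AudioProcessor(config["audio_config"])
# populates wav/, mel/, f0/, frame_f0/, frame_uv/, energy/, frame_energy/,
# raw_duration/ (and duration/ when an aux metafile is given), plus badlist.txt
ap.process("data/my_voice", "dump/my_voice")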
kantts/preprocess/audio_processor/core/__init__.py
0 → 100644
kantts/preprocess/audio_processor/core/__pycache__/__init__.cpython-38.pyc
0 → 100644
File added
kantts/preprocess/audio_processor/core/__pycache__/dsp.cpython-38.pyc
0 → 100644
File added
kantts/preprocess/audio_processor/core/__pycache__/utils.cpython-38.pyc
0 → 100644
File added
kantts/preprocess/audio_processor/core/dsp.py
0 → 100644
import numpy as np
import librosa
import librosa.filters
from scipy.io import wavfile
from scipy import signal


def _stft(y, hop_length, win_length, n_fft):
    return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)


def _istft(y, hop_length, win_length):
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)


def _db_to_amp(x):
    return np.power(10.0, x * 0.05)


def _amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def load_wav(path, sr):
    return librosa.load(path, sr=sr)[0]


def save_wav(wav, path, sr):
    if wav.dtype == np.float32 or wav.dtype == np.float64:
        quant_wav = 32767 * wav
    else:
        quant_wav = wav
    # maximize the volume to avoid clipping
    # wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, sr, quant_wav.astype(np.int16))


def trim_silence(wav, top_db, hop_length, win_length):
    trimed_wav, _ = librosa.effects.trim(
        wav, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    return trimed_wav


def trim_silence_with_interval(wav, interval, hop_length):
    if interval is None:
        return None
    leading_sil = interval[0]
    tailing_sil = interval[-1]
    trim_wav = wav[leading_sil * hop_length : -tailing_sil * hop_length]
    return trim_wav


def preemphasis(wav, k=0.98, preemphasize=False):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav


def inv_preemphasis(wav, k=0.98, inv_preemphasize=False):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav


def _normalize(S, max_norm=1.0, min_level_db=-100, symmetric=False):
    if symmetric:
        return np.clip(
            (2 * max_norm) * ((S - min_level_db) / (-min_level_db)) - max_norm,
            -max_norm,
            max_norm,
        )
    else:
        return np.clip(max_norm * ((S - min_level_db) / (-min_level_db)), 0, max_norm)


def _denormalize(D, max_norm=1.0, min_level_db=-100, symmetric=False):
    if symmetric:
        return (
            (np.clip(D, -max_norm, max_norm) + max_norm)
            * -min_level_db
            / (2 * max_norm)
        ) + min_level_db
    else:
        return (np.clip(D, 0, max_norm) * -min_level_db / max_norm) + min_level_db
def _griffin_lim(S, n_fft, hop_length, win_length, griffin_lim_iters=60):
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    # use the builtin complex type (np.complex was removed from recent NumPy)
    S_complex = np.abs(S).astype(complex)
    y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    for i in range(griffin_lim_iters):
        angles = np.exp(
            1j
            * np.angle(
                _stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
            )
        )
        y = _istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    return y
def spectrogram(
    y,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    symmetric=False,
):
    D = _stft(preemphasis(y), hop_length, win_length, n_fft)
    S = _amp_to_db(np.abs(D)) - ref_level_db
    return _normalize(S, max_norm, min_level_db, symmetric)


def inv_spectrogram(
    spectrogram,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    symmetric=False,
    power=1.5,
):
    S = _db_to_amp(
        _denormalize(spectrogram, max_norm, min_level_db, symmetric) + ref_level_db
    )
    return _griffin_lim(S**power, n_fft, hop_length, win_length)


def _build_mel_basis(sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
    assert fmax <= sample_rate // 2
    return librosa.filters.mel(
        sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
    )


# mel <-> linear conversions
_mel_basis = None
_inv_mel_basis = None


def _linear_to_mel(spectogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
    return np.dot(_mel_basis, spectogram)


def _mel_to_linear(
    mel_spectrogram, sample_rate, n_fft=1024, fmin=50, fmax=8000, n_mels=80
):
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(
            _build_mel_basis(sample_rate, n_fft, fmin, fmax, n_mels)
        )
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))


def melspectrogram(
    y,
    sample_rate,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_mels=80,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    fmin=50,
    fmax=8000,
    symmetric=False,
    preemphasize=False,
):
    D = _stft(
        preemphasis(y, preemphasize=preemphasize),
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
    )
    S = (
        _amp_to_db(
            _linear_to_mel(
                np.abs(D),
                sample_rate=sample_rate,
                n_fft=n_fft,
                fmin=fmin,
                fmax=fmax,
                n_mels=n_mels,
            )
        )
        - ref_level_db
    )
    return _normalize(
        S, max_norm=max_norm, min_level_db=min_level_db, symmetric=symmetric
    ).T


def inv_mel_spectrogram(
    mel_spectrogram,
    sample_rate,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_mels=80,
    max_norm=1.0,
    min_level_db=-100,
    ref_level_db=20,
    fmin=50,
    fmax=8000,
    power=1.5,
    symmetric=False,
    preemphasize=False,
):
    D = _denormalize(
        mel_spectrogram,
        max_norm=max_norm,
        min_level_db=min_level_db,
        symmetric=symmetric,
    )
    S = _mel_to_linear(
        _db_to_amp(D + ref_level_db),
        sample_rate=sample_rate,
        n_fft=n_fft,
        fmin=fmin,
        fmax=fmax,
        n_mels=n_mels,
    )
    return inv_preemphasis(
        _griffin_lim(S**power, n_fft, hop_length, win_length),
        preemphasize=preemphasize,
    )
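
A minimal round-trip sketch for the module above, assuming a 16 kHz mono wav and the default analysis parameters; the paths are hypothetical and the import path mirrors the repository layout:

from kantts.preprocess.audio_processor.core.dsp import (
    load_wav,
    save_wav,
    melspectrogram,
    inv_mel_spectrogram,
)

wav = load_wav("sample.wav", sr=16000)        # hypothetical input path
mel = melspectrogram(wav, sample_rate=16000)  # (frames, 80), normalized dB
# inv_mel_spectrogram expects (n_mels, frames) and inverts via Griffin-Lim
recon = inv_mel_spectrogram(mel.T, sample_rate=16000)
save_wav(recon, "recon.wav", sr=16000)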
kantts/preprocess/audio_processor/core/utils.py
0 → 100644
import os
from glob import glob
import numpy as np
import sox
import librosa
import pysptk
from scipy.io import wavfile
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import logging

from .dsp import _stft

anchor_hist = np.array(
    [
        0.0, 0.00215827, 0.00354383, 0.00442313, 0.00490274, 0.00532907,
        0.00602185, 0.00690115, 0.00810019, 0.00948574, 0.0120437, 0.01489475,
        0.01873168, 0.02302158, 0.02872369, 0.03669065, 0.04636291, 0.05843325,
        0.07700506, 0.11052491, 0.16802558, 0.25997868, 0.37942979, 0.50730083,
        0.62006395, 0.71092459, 0.76877165, 0.80762057, 0.83458566, 0.85672795,
        0.87660538, 0.89251266, 0.90578204, 0.91569411, 0.92541966, 0.93383959,
        0.94162004, 0.94940048, 0.95539568, 0.96136424, 0.9670397, 0.97290168,
        0.97705835, 0.98116174, 0.98465228, 0.98814282, 0.99152678, 0.99421796,
        0.9965894, 0.99840128, 1.0,
    ]
)

anchor_bins = np.array(
    [
        0.033976, 0.03529014, 0.03660428, 0.03791842, 0.03923256, 0.0405467,
        0.04186084, 0.04317498, 0.04448912, 0.04580326, 0.0471174, 0.04843154,
        0.04974568, 0.05105982, 0.05237396, 0.0536881, 0.05500224, 0.05631638,
        0.05763052, 0.05894466, 0.0602588, 0.06157294, 0.06288708, 0.06420122,
        0.06551536, 0.0668295, 0.06814364, 0.06945778, 0.07077192, 0.07208606,
        0.0734002, 0.07471434, 0.07602848, 0.07734262, 0.07865676, 0.0799709,
        0.08128504, 0.08259918, 0.08391332, 0.08522746, 0.0865416, 0.08785574,
        0.08916988, 0.09048402, 0.09179816, 0.0931123, 0.09442644, 0.09574058,
        0.09705472, 0.09836886, 0.099683,
    ]
)

hist_bins = 50
def amp_info(wav_file_path):
    """
    Returns the amplitude info of the wav file.
    """
    stats = sox.file_info.stat(wav_file_path)
    amp_rms = stats["RMS amplitude"]
    amp_max = stats["Maximum amplitude"]
    amp_mean = stats["Mean amplitude"]
    length = stats["Length (seconds)"]
    return {
        "amp_rms": amp_rms,
        "amp_max": amp_max,
        "amp_mean": amp_mean,
        "length": length,
        "basename": os.path.basename(wav_file_path),
    }


# TODO: multi-processing
def statistic_amplitude(src_wav_dir):
    """
    Collects amplitude statistics for every wav file under the directory.
    """
    wav_lst = glob(os.path.join(src_wav_dir, "*.wav"))
    with ProcessPoolExecutor(max_workers=8) as executor, tqdm(
        total=len(wav_lst)
    ) as progress:
        futures = []
        for wav_file_path in wav_lst:
            future = executor.submit(amp_info, wav_file_path)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)
        amp_info_lst = [future.result() for future in futures]
    amp_info_lst = sorted(amp_info_lst, key=lambda x: x["amp_rms"])
    logging.info(
        "Average amplitude RMS : {}".format(
            np.mean([x["amp_rms"] for x in amp_info_lst])
        )
    )
    # cnt = len(amp_info_lst)
    # pinhead_cnt = math.floor(cnt * 0.01)
    # return amp_info_lst[pinhead_cnt : cnt - pinhead_cnt]
    return amp_info_lst
# TODO: multi-process
def volume_normalize(src_wav_dir, out_wav_dir):
    logging.info("Volume statistic proceeding...")
    amp_info_lst = statistic_amplitude(src_wav_dir)
    logging.info("Volume statistic done.")

    rms_amp_lst = [x["amp_rms"] for x in amp_info_lst]
    src_hist, src_bins = np.histogram(rms_amp_lst, bins=hist_bins, density=True)
    src_hist = src_hist / np.sum(src_hist)
    src_hist = np.cumsum(src_hist)
    src_hist = np.insert(src_hist, 0, 0.0)

    logging.info("Volume normalization proceeding...")
    for amp_info in tqdm(amp_info_lst):
        rms_amp = amp_info["amp_rms"]
        rms_amp = np.clip(rms_amp, src_bins[0], src_bins[-1])
        src_idx = np.where(rms_amp >= src_bins)[0][-1]
        src_pos = src_hist[src_idx]
        anchor_idx = np.where(src_pos >= anchor_hist)[0][-1]
        if src_idx == hist_bins or anchor_idx == hist_bins:
            rms_amp = anchor_bins[-1]
        else:
            rms_amp = (rms_amp - src_bins[src_idx]) / (
                src_bins[src_idx + 1] - src_bins[src_idx]
            ) * (anchor_bins[anchor_idx + 1] - anchor_bins[anchor_idx]) + anchor_bins[
                anchor_idx
            ]
        scale = rms_amp / amp_info["amp_rms"]
        # FIXME: this is a hack to avoid clipping the sound.
        sr, data = wavfile.read(os.path.join(src_wav_dir, amp_info["basename"]))
        wavfile.write(
            os.path.join(out_wav_dir, amp_info["basename"]),
            sr,
            (data * scale).astype(np.int16),
        )
    logging.info("Volume normalization done.")
    return True
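

# Illustrative usage (editor's sketch; this helper and the directory names
# are not part of the original module): volume_normalize() histogram-matches
# per-file RMS amplitudes against the fixed anchor distribution
# (anchor_hist / anchor_bins) above, so a whole corpus lands at a consistent
# loudness.
def _example_volume_normalize(src_wav_dir="raw_wav", out_wav_dir="norm_wav"):
    os.makedirs(out_wav_dir, exist_ok=True)
    return volume_normalize(src_wav_dir, out_wav_dir)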
def interp_f0(f0_data):
    """
    linear interpolation
    """
    f0_data[f0_data < 1] = 0
    xp = np.nonzero(f0_data)
    yp = f0_data[xp]
    x = np.arange(f0_data.size)
    contour_f0 = np.interp(x, xp[0], yp).astype(np.float32)
    return contour_f0


def frame_nccf(x, y):
    norm_coef = (np.sum(x**2.0) * np.sum(y**2.0) + 1e-30) ** 0.5
    return (np.sum(x * y) / norm_coef + 1.0) / 2.0
def get_nccf(pcm_data, f0, min_f0=40, max_f0=800, fs=160, sr=16000):
    if pcm_data.dtype == np.int16:
        pcm_data = pcm_data.astype(np.float32) / 32768
    frame_len = int(sr / 200)
    frame_num = int(len(pcm_data) // fs)
    frame_num = min(frame_num, len(f0))
    pad_len = int(sr / min_f0) + frame_len
    pad_zeros = np.zeros([pad_len], dtype=np.float32)
    data = np.hstack((pad_zeros, pcm_data.astype(np.float32), pad_zeros))
    nccf = np.zeros((frame_num), dtype=np.float32)
    for i in range(frame_num):
        curr_f0 = np.clip(f0[i], min_f0, max_f0)
        lag = int(sr / curr_f0 + 0.5)
        j = i * fs + pad_len - frame_len // 2
        l_data = data[j : j + frame_len]
        l_data -= l_data.mean()
        r_data = data[j + lag : j + lag + frame_len]
        r_data -= r_data.mean()
        nccf[i] = frame_nccf(l_data, r_data)
    return nccf


def smooth(data, win_len):
    if win_len % 2 == 0:
        win_len += 1
    hwin = win_len // 2
    win = np.hanning(win_len)
    win /= win.sum()
    data = data.reshape([-1])
    pad_data = np.pad(data, hwin, mode="edge")
    for i in range(data.shape[0]):
        data[i] = np.dot(win, pad_data[i : i + win_len])
    return data.reshape([-1, 1])
# TODO: pysptk only supports two methods to estimate F0 for now.
# supported: rapt, swipe
# unsupported: reaper, world (DIO)
def RAPT_FUNC(v1, v2, v3, v4, v5):
    return pysptk.sptk.rapt(v1.astype(np.float32), fs=v2, hopsize=v3, min=v4, max=v5)


def SWIPE_FUNC(v1, v2, v3, v4, v5):
    return pysptk.sptk.swipe(v1.astype(np.float64), fs=v2, hopsize=v3, min=v4, max=v5)


def PYIN_FUNC(v1, v2, v3, v4, v5):
    f0_mel = librosa.pyin(
        v1.astype(np.float32), sr=v2, frame_length=v3 * 4, fmin=v4, fmax=v5
    )[0]
    f0_mel = np.where(np.isnan(f0_mel), 0.0, f0_mel)
    return f0_mel


def get_pitch(pcm_data, sampling_rate=16000, hop_length=160):
    log_f0_list = []
    uv_list = []

    low, high = 40, 800
    cali_f0 = pysptk.sptk.rapt(
        pcm_data.astype(np.float32),
        fs=sampling_rate,
        hopsize=hop_length,
        min=low,
        max=high,
    )
    f0_range = np.sort(np.unique(cali_f0))
    if len(f0_range) > 20:
        low = max(f0_range[10] - 50, low)
        high = min(f0_range[-10] + 50, high)

    func_dict = {"rapt": RAPT_FUNC, "swipe": SWIPE_FUNC}

    for func_name in func_dict:
        f0 = func_dict[func_name](pcm_data, sampling_rate, hop_length, low, high)
        uv = f0 > 0
        if len(f0) < 10 or f0.max() < low:
            logging.error("{} method: calc F0 is too low.".format(func_name))
            continue
        else:
            f0 = np.clip(f0, 1e-30, high)
            log_f0 = np.log(f0)
            contour_log_f0 = interp_f0(log_f0)
            log_f0_list.append(contour_log_f0)
            uv_list.append(uv)

    if len(log_f0_list) == 0:
        logging.error("F0 estimation failed.")
        return None

    min_len = float("inf")
    for log_f0 in log_f0_list:
        min_len = min(min_len, log_f0.shape[0])

    multi_log_f0 = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
    multi_uv = np.zeros([len(log_f0_list), min_len], dtype=np.float32)
    for i in range(len(log_f0_list)):
        multi_log_f0[i, :] = log_f0_list[i][:min_len]
        multi_uv[i, :] = uv_list[i][:min_len]

    log_f0 = smooth(np.median(multi_log_f0, axis=0), 5)
    uv = (smooth(np.median(multi_uv, axis=0), 5) > 0.5).astype(np.float32)
    f0 = np.exp(log_f0)

    # nccf = get_nccf(
    #     pcm_data, f0, min_f0=low, max_f0=high, fs=hop_length, sr=sampling_rate
    # )

    min_len = min(f0.shape[0], uv.shape[0])
    return f0[:min_len], uv[:min_len], f0[:min_len] * uv[:min_len]
# TODO: some DSP functions are not implemented.
def get_energy(pcm_data, hop_length, win_length, n_fft):
    D = _stft(pcm_data, hop_length, win_length, n_fft)
    S, _ = librosa.magphase(D)
    energy = np.sqrt(np.sum(S**2, axis=0))
    return energy.reshape((-1, 1))
def align_length(in_data, tgt_data, basename=None):
    if in_data is None or tgt_data is None:
        logging.error("{}: Input data is None.".format(basename))
        return None
    in_len = in_data.shape[0]
    tgt_len = tgt_data.shape[0]
    if abs(in_len - tgt_len) > 20:
        logging.error(
            "{}: Input data length differs from the target data length by too much.".format(
                basename
            )
        )
        return None
    if in_len < tgt_len:
        out_data = np.pad(
            in_data, ((0, tgt_len - in_len), (0, 0)), "constant", constant_values=0.0
        )
    else:
        out_data = in_data[:tgt_len]
    return out_data
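

# Illustrative usage (editor's sketch; the helper name, the wav path, and the
# 10 ms hop assumption are not part of the original module): extract the
# frame-level F0/UV tracks via the rapt+swipe median fusion in get_pitch(),
# the frame energy via get_energy(), and pad/crop energy to the F0 length
# with align_length().
def _example_frame_features(wav_path="sample.wav"):
    sr, pcm = wavfile.read(wav_path)
    res = get_pitch(pcm, sampling_rate=sr, hop_length=sr // 100)
    if res is None:
        return None
    f0, uv, voiced_f0 = res
    energy = get_energy(pcm.astype(np.float32) / 32768, sr // 100, sr // 40, 1024)
    energy = align_length(energy, f0, os.path.basename(wav_path))
    return f0, uv, voiced_f0, energy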
def compute_mean(data_list, dims=80):
    mean_vector = np.zeros((1, dims))
    all_frame_number = 0
    for data in tqdm(data_list):
        if data is None:
            continue
        features = data.reshape((-1, dims))
        current_frame_number = np.shape(features)[0]
        mean_vector += np.sum(features[:, :], axis=0)
        all_frame_number += current_frame_number
    mean_vector /= float(all_frame_number)
    return mean_vector


def compute_std(data_list, mean_vector, dims=80):
    std_vector = np.zeros((1, dims))
    all_frame_number = 0
    for data in tqdm(data_list):
        if data is None:
            continue
        features = data.reshape((-1, dims))
        current_frame_number = np.shape(features)[0]
        mean_matrix = np.tile(mean_vector, (current_frame_number, 1))
        std_vector += np.sum((features[:, :] - mean_matrix) ** 2, axis=0)
        all_frame_number += current_frame_number
    std_vector /= float(all_frame_number)
    std_vector = std_vector**0.5
    return std_vector
F0_MIN = 0.0
F0_MAX = 800.0
ENERGY_MIN = 0.0
ENERGY_MAX = 200.0
CLIP_FLOOR = 1e-3


def f0_norm_min_max(f0):
    zero_idxs = np.where(f0 <= CLIP_FLOOR)[0]
    res = (2 * f0 - F0_MIN - F0_MAX) / (F0_MAX - F0_MIN)
    res[zero_idxs] = 0.0
    return res


def f0_denorm_min_max(f0):
    zero_idxs = np.where(f0 == 0.0)[0]
    res = (f0 * (F0_MAX - F0_MIN) + F0_MIN + F0_MAX) / 2
    res[zero_idxs] = 0.0
    return res


def energy_norm_min_max(energy):
    zero_idxs = np.where(energy == 0.0)[0]
    res = (2 * energy - ENERGY_MIN - ENERGY_MAX) / (ENERGY_MAX - ENERGY_MIN)
    res[zero_idxs] = 0.0
    return res


def energy_denorm_min_max(energy):
    zero_idxs = np.where(energy == 0.0)[0]
    res = (energy * (ENERGY_MAX - ENERGY_MIN) + ENERGY_MIN + ENERGY_MAX) / 2
    res[zero_idxs] = 0.0
    return res


def norm_log(x):
    zero_idxs = np.where(x <= CLIP_FLOOR)[0]
    x[zero_idxs] = 1.0
    res = np.log(x)
    return res


def denorm_log(x):
    zero_idxs = np.where(x == 0.0)[0]
    res = np.exp(x)
    res[zero_idxs] = 0.0
    return res


def f0_norm_mean_std(x, mean, std):
    zero_idxs = np.where(x == 0.0)[0]
    x = (x - mean) / std
    x[zero_idxs] = 0.0
    return x


def norm_mean_std(x, mean, std):
    x = (x - mean) / std
    return x
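

# Illustrative check (editor's sketch; this helper is not part of the
# original module): unvoiced frames (f0 <= CLIP_FLOOR) stay pinned at 0.0
# through the symmetric min-max normalization above, and voiced values
# round-trip.
def _example_f0_norm_roundtrip():
    f0 = np.array([0.0, 200.0, 300.0], dtype=np.float32)
    normed = f0_norm_min_max(f0.copy())  # -> [0.0, -0.5, -0.25]
    return f0_denorm_min_max(normed)     # -> [0.0, 200.0, 300.0]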
# TODO: this is a hard-coded parser for MIT-style interval labels
# TODO: implement a more general version
def parse_interval_file(file_path, sampling_rate, hop_length):
    with open(file_path, "r") as f:
        lines = f.readlines()
    # frame interval in seconds
    frame_intervals = 1.0 * hop_length / sampling_rate
    skip_lines = 12
    dur_list = []
    phone_list = []
    line_index = skip_lines
    while line_index < len(lines):
        phone_begin = float(lines[line_index])
        phone_end = float(lines[line_index + 1])
        phone = lines[line_index + 2].strip()[1:-1]
        dur_list.append(int(round((phone_end - phone_begin) / frame_intervals)))
        phone_list.append(phone)
        line_index += 3
    if len(dur_list) == 0 or len(phone_list) == 0:
        return None
    return np.array(dur_list), phone_list
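

# Illustrative usage (editor's sketch; the helper, the file name, and the
# placeholder header are not part of the original module): an MIT-style
# interval file carries 12 header lines followed by begin/end/"phone"
# triplets, which are converted to frame counts at a given sample rate and
# hop size.
def _example_parse_interval(tmp_path="example.interval"):
    header = "\n".join(["header"] * 12)
    body = "\n".join(["0.00", "0.10", '"sil"', "0.10", "0.25", '"a_c"'])
    with open(tmp_path, "w") as f:
        f.write(header + "\n" + body + "\n")
    # 16 kHz with a 160-sample hop -> 10 ms frames
    durs, phones = parse_interval_file(tmp_path, 16000, 160)
    # durs -> array([10, 15]), phones -> ['sil', 'a_c']
    return durs, phones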
def average_by_duration(x, durs):
    if x is None or durs is None:
        return None
    durs_cum = np.cumsum(np.pad(durs, (1, 0), "constant"))
    # average over each symbol's duration
    x_symbol = np.zeros((durs.shape[0],), dtype=np.float32)
    for idx, start, end in zip(range(durs.shape[0]), durs_cum[:-1], durs_cum[1:]):
        values = x[start:end][np.where(x[start:end] != 0.0)[0]]
        x_symbol[idx] = np.mean(values) if len(values) > 0 else 0.0
    return x_symbol.astype(np.float32)
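

# Illustrative check (editor's sketch; this helper is not part of the
# original module): frame values are averaged per phone while zero
# (unvoiced) frames are skipped, which is how energy/F0 are reduced to
# phone level above.
def _example_average_by_duration():
    frame_vals = np.array([0.0, 2.0, 4.0, 6.0, 0.0, 0.0], dtype=np.float32)
    durs = np.array([3, 3])  # two phones, three frames each
    # phone 0 averages [2.0, 4.0] -> 3.0; phone 1 averages [6.0] -> 6.0
    return average_by_duration(frame_vals, durs)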
def encode_16bits(x):
    if x.min() > -1.0 and x.max() < 1.0:
        return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16)
    else:
        return x
if __name__ == "__main__":
    import sys

    infile = sys.argv[1]
    sr, pcm_data = wavfile.read(infile)
    res = get_pitch(pcm_data, 24000, 240)
    print(res)
kantts/preprocess/data_process.py
0 → 100644
import logging
import os
import sys
import argparse
import yaml
import time
import codecs

ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # NOQA: E402
sys.path.insert(0, os.path.dirname(ROOT_PATH))  # NOQA: E402

try:
    from kantts.preprocess.audio_processor.audio_processor import AudioProcessor
    from kantts.preprocess.se_processor.se_processor import SpeakerEmbeddingProcessor
    from kantts.preprocess.script_convertor.TextScriptConvertor import (
        TextScriptConvertor,
    )
    from kantts.preprocess.fp_processor import FpProcessor, is_fp_line
    from kantts.preprocess.languages import languages
    from kantts.datasets.dataset import AM_Dataset, Voc_Dataset
    from kantts.utils.log import logging_to_file, get_git_revision_hash
except ImportError:
    raise ImportError("Please install kantts.")

logging.basicConfig(
    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=logging.INFO,
)

LANGUAGES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "languages")
def gen_metafile(
    voice_output_dir,
    fp_enable=False,
    badlist=None,
    split_ratio=0.98,
):
    voc_train_meta = os.path.join(voice_output_dir, "train.lst")
    voc_valid_meta = os.path.join(voice_output_dir, "valid.lst")
    if not os.path.exists(voc_train_meta) or not os.path.exists(voc_valid_meta):
        Voc_Dataset.gen_metafile(
            os.path.join(voice_output_dir, "wav"),
            voice_output_dir,
            split_ratio,
        )
        logging.info("Voc metafile generated.")

    raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
    am_train_meta = os.path.join(voice_output_dir, "am_train.lst")
    am_valid_meta = os.path.join(voice_output_dir, "am_valid.lst")
    if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
        AM_Dataset.gen_metafile(
            raw_metafile,
            voice_output_dir,
            am_train_meta,
            am_valid_meta,
            badlist,
            split_ratio,
        )
        logging.info("AM metafile generated.")

    if fp_enable:
        fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
        am_train_meta = os.path.join(voice_output_dir, "am_fpadd_train.lst")
        am_valid_meta = os.path.join(voice_output_dir, "am_fpadd_valid.lst")
        if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
            AM_Dataset.gen_metafile(
                fpadd_metafile,
                voice_output_dir,
                am_train_meta,
                am_valid_meta,
                badlist,
                split_ratio,
            )
            logging.info("AM fpadd metafile generated.")

        fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
        am_train_meta = os.path.join(voice_output_dir, "am_fprm_train.lst")
        am_valid_meta = os.path.join(voice_output_dir, "am_fprm_valid.lst")
        if not os.path.exists(am_train_meta) or not os.path.exists(am_valid_meta):
            AM_Dataset.gen_metafile(
                fprm_metafile,
                voice_output_dir,
                am_train_meta,
                am_valid_meta,
                badlist,
                split_ratio,
            )
            logging.info("AM fprm metafile generated.")
# TODO: Zh-CN as default
def process_data(
    voice_input_dir,
    voice_output_dir,
    audio_config,
    speaker_name=None,
    targetLang="PinYin",
    skip_script=False,
    se_model=None,
):
    foreignLang = "EnUS"

    # check whether the voice ships an emotion tag file
    if not os.path.exists(os.path.join(voice_input_dir, "emotion_tag.txt")):
        emo_tag_path = None
    else:
        emo_tag_path = os.path.join(voice_input_dir, "emotion_tag.txt")

    phoneset_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["phoneset_path"]
    )
    posset_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["posset_path"]
    )
    f2t_map_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["f2t_map_path"]
    )
    s2p_map_path = os.path.join(
        LANGUAGES_DIR, targetLang, languages[targetLang]["s2p_map_path"]
    )

    # dir of plain text/sentences for training a byte-based model
    plain_text_dir = os.path.join(voice_input_dir, "text")

    if speaker_name is None:
        speaker_name = os.path.basename(voice_input_dir)

    if audio_config is not None:
        with open(audio_config, "r") as f:
            config = yaml.load(f, Loader=yaml.Loader)
    config["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    config["git_revision_hash"] = get_git_revision_hash()
    se_enable = config["audio_config"].get("se_feature", False)

    with open(os.path.join(voice_output_dir, "audio_config.yaml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper, default_flow_style=None)

    if skip_script:
        logging.info("Skip script conversion")
        raw_metafile = None
        fp_enable = False

    # Script processor
    if not skip_script:
        raw_metafile = os.path.join(voice_output_dir, "raw_metafile.txt")
        if os.path.exists(plain_text_dir):
            TextScriptConvertor.turn_text_into_bytes(
                os.path.join(plain_text_dir, "text.txt"),
                raw_metafile,
                speaker_name,
            )
            fp_enable = False
        else:
            tsc = TextScriptConvertor(
                phoneset_path,
                posset_path,
                targetLang,
                foreignLang,
                f2t_map_path,
                s2p_map_path,
                emo_tag_path,
                speaker_name,
            )
            tsc.process(
                os.path.join(voice_input_dir, "prosody", "prosody.txt"),
                os.path.join(voice_output_dir, "Script.xml"),
                raw_metafile,
            )

            prosody = os.path.join(voice_input_dir, "prosody", "prosody.txt")

            # FP processor
            with codecs.open(prosody, "r", "utf-8") as f:
                lines = f.readlines()
            fp_enable = is_fp_line(lines[1])
            if fp_enable:
                FP = FpProcessor()
                FP.process(
                    voice_output_dir,
                    prosody,
                    raw_metafile,
                )
                logging.info("Processing fp done.")

    # Audio processor
    ap = AudioProcessor(config["audio_config"])
    ap.process(
        voice_input_dir,
        voice_output_dir,
        raw_metafile,
    )
    logging.info("Processing audio done.")

    # SpeakerEmbedding processor
    if se_enable:
        sep = SpeakerEmbeddingProcessor()
        sep.process(
            voice_output_dir,
            se_model,
        )
        logging.info("Processing speaker embedding done.")

    logging.info("Processing done.")

    # Generate Voc & AM metafiles
    # TODO: train/valid ratio setting
    gen_metafile(voice_output_dir, fp_enable, ap.badcase_list)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Dataset preprocessor")
    parser.add_argument("--voice_input_dir", type=str, required=True)
    parser.add_argument("--voice_output_dir", type=str, required=True)
    parser.add_argument("--audio_config", type=str, required=True)
    parser.add_argument("--speaker", type=str, default=None, help="speaker")
    parser.add_argument("--lang", type=str, default="PinYin", help="target language")
    parser.add_argument(
        "--se_model",
        type=str,
        default="../pre_data/speaker_embeddding/se.*",
        help="speaker embedding extractor model",
    )
    parser.add_argument(
        "--skip_script", action="store_true", help="skip script converting"
    )
    args = parser.parse_args()

    os.makedirs(args.voice_output_dir, exist_ok=True)
    logging_to_file(os.path.join(args.voice_output_dir, "data_process_stdout.log"))

    try:
        process_data(
            args.voice_input_dir,
            args.voice_output_dir,
            args.audio_config,
            args.speaker,
            args.lang,
            args.skip_script,
            args.se_model,
        )
    except (Exception, KeyboardInterrupt) as e:
        logging.error(e, exc_info=True)
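
A minimal sketch of invoking the end-to-end preprocessing from Python rather than via the CLI above, assuming the standard voice layout (`prosody/prosody.txt`, `wav/`, and optionally `interval/`); the paths and speaker name are hypothetical:

from kantts.preprocess.data_process import process_data

process_data(
    voice_input_dir="data/my_voice",
    voice_output_dir="dump/my_voice",
    audio_config="configs/audio_config.yaml",
    speaker_name="my_voice",
    targetLang="PinYin",
)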
kantts/preprocess/fp_processor.py
0 → 100644
import os
import logging
import random


def is_fp_line(line):
    fp_category_list = ["FP", "I", "N", "Q"]
    elements = line.strip().split(" ")
    res = True
    for ele in elements:
        if ele not in fp_category_list:
            res = False
            break
    return res
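

# Illustrative check (editor's sketch; this helper is not part of the
# original module): a prosody line counts as an FP label line only when
# every whitespace-separated token is one of FP/I/N/Q.
def _example_is_fp_line():
    assert is_fp_line("N N FP N") is True   # every token is an FP category
    assert is_fp_line("ni3 hao3") is False  # ordinary prosody text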
class FpProcessor:
    def __init__(self):
        # TODO: Add more audio processing methods.
        self.res = []
    # TODO: adjust idx judgment rule
    def addfp(self, voice_output_dir, prosody, raw_metafile_lines):
        fp_category_list = ["FP", "I", "N"]

        with open(prosody) as f:
            prosody_lines = f.readlines()

        idx = ""
        fp = ""
        fp_label_dict = {}
        i = 0
        while i < len(prosody_lines):
            if len(prosody_lines[i].strip().split("\t")) == 2:
                idx = prosody_lines[i].strip().split("\t")[0]
                i += 1
            else:
                fp_enable = is_fp_line(prosody_lines[i])
                if fp_enable:
                    fp = prosody_lines[i].strip().split("\t")[0].split(" ")
                    for label in fp:
                        if label not in fp_category_list:
                            logging.warning("fp label not in fp_category_list")
                            break
                    i += 4
                else:
                    fp = [
                        "N"
                        for _ in range(
                            len(
                                prosody_lines[i]
                                .strip()
                                .split("\t")[0]
                                .replace("/ ", "")
                                .replace(". ", "")
                                .split(" ")
                            )
                        )
                    ]
                    i += 1
                fp_label_dict[idx] = fp

        fpadd_metafile = os.path.join(voice_output_dir, "fpadd_metafile.txt")
        f_out = open(fpadd_metafile, "w")
        for line in raw_metafile_lines:
            tokens = line.strip().split("\t")
            if len(tokens) == 2:
                uttname = tokens[0]
                symbol_sequences = tokens[1].split(" ")
                error_flag = False
                idx = 0
                out_str = uttname + "\t"
                for this_symbol_sequence in symbol_sequences:
                    emotion = this_symbol_sequence.split("$")[4]
                    this_symbol_sequence = this_symbol_sequence.replace(
                        emotion, "emotion_neutral"
                    )
                    if idx < len(fp_label_dict[uttname]):
                        if fp_label_dict[uttname][idx] == "FP":
                            if "none" not in this_symbol_sequence:
                                this_symbol_sequence = this_symbol_sequence.replace(
                                    "emotion_neutral", "emotion_disgust"
                                )
                        syllable_label = this_symbol_sequence.split("$")[2]
                        if syllable_label == "s_both" or syllable_label == "s_end":
                            idx += 1
                    elif idx > len(fp_label_dict[uttname]):
                        logging.warning(uttname + " not match")
                        error_flag = True
                    out_str = out_str + this_symbol_sequence + " "
                # if idx != len(fp_label_dict[uttname]):
                #     logging.warning(
                #         "{} length mismatch, length: {} ".format(
                #             idx, len(fp_label_dict[uttname])
                #         )
                #     )
                if not error_flag:
                    f_out.write(out_str.strip() + "\n")
        f_out.close()
        return fpadd_metafile
    def removefp(self, voice_output_dir, fpadd_metafile, raw_metafile_lines):
        with open(fpadd_metafile) as f:
            fpadd_metafile_lines = f.readlines()

        fprm_metafile = os.path.join(voice_output_dir, "fprm_metafile.txt")
        f_out = open(fprm_metafile, "w")
        for i in range(len(raw_metafile_lines)):
            tokens = raw_metafile_lines[i].strip().split("\t")
            symbol_sequences = tokens[1].split(" ")
            fpadd_tokens = fpadd_metafile_lines[i].strip().split("\t")
            fpadd_symbol_sequences = fpadd_tokens[1].split(" ")
            error_flag = False
            out_str = tokens[0] + "\t"
            idx = 0
            length = len(symbol_sequences)
            while idx < length:
                if "$emotion_disgust" in fpadd_symbol_sequences[idx]:
                    if idx + 1 < length and "none" in fpadd_symbol_sequences[idx + 1]:
                        idx = idx + 2
                    else:
                        idx = idx + 1
                    continue
                out_str = out_str + symbol_sequences[idx] + " "
                idx = idx + 1
            if not error_flag:
                f_out.write(out_str.strip() + "\n")
        f_out.close()
    def process(self, voice_output_dir, prosody, raw_metafile):
        with open(raw_metafile, "r") as f:
            lines = f.readlines()
        random.shuffle(lines)
        fpadd_metafile = self.addfp(voice_output_dir, prosody, lines)
        self.removefp(voice_output_dir, fpadd_metafile, lines)
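
A minimal sketch of running the FP (filled-pause) processor on its own, assuming a prosody file whose alternate lines carry FP/I/N/Q labels and a raw metafile produced by the script convertor; all paths are hypothetical:

from kantts.preprocess.fp_processor import FpProcessor

fp = FpProcessor()
fp.process(
    "dump/my_voice",                      # output dir for the two metafiles
    "data/my_voice/prosody/prosody.txt",  # prosody with FP label lines
    "dump/my_voice/raw_metafile.txt",     # produced by TextScriptConvertor
)
# writes dump/my_voice/fpadd_metafile.txt and dump/my_voice/fprm_metafile.txt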
kantts/preprocess/languages/PinYin/En2ChPhoneMap.txt
0 → 100644
wu w
yi y
kantts/preprocess/languages/PinYin/PhoneSet.xml
0 → 100644
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
  <phone><id>0</id><name>a_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>1</id><name>ai_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>2</id><name>an_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>3</id><name>ang_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>4</id><name>ao_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>5</id><name>b_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>6</id><name>c_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>7</id><name>ch_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>8</id><name>d_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>9</id><name>e_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>10</id><name>ei_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>11</id><name>en_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>12</id><name>eng_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>13</id><name>er_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>14</id><name>f_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>15</id><name>g_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>16</id><name>h_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>backtongue</ap><am>affricative</am></phone>
  <phone><id>17</id><name>i_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>backtongue</ap><am>affricative</am></phone>
  <phone><id>18</id><name>ia_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>19</id><name>ian_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>20</id><name>iang_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>21</id><name>iao_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>22</id><name>ie_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>23</id><name>ih_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>24</id><name>ii_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>25</id><name>in_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>26</id><name>ing_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>27</id><name>io_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>28</id><name>iong_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>29</id><name>iou_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>30</id><name>j_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>31</id><name>k_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>32</id><name>l_c</name><cv>vowel</cv><if>initial</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>33</id><name>m_c</name><cv>vowel</cv><if>initial</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>34</id><name>n_c</name><cv>vowel</cv><if>initial</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>35</id><name>o_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>36</id><name>ong_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>37</id><name>ou_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>38</id><name>p_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>liptooth</ap><am>fricative</am></phone>
  <phone><id>39</id><name>q_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>liptooth</ap><am>fricative</am></phone>
  <phone><id>40</id><name>r_c</name><cv>vowel</cv><if>initial</if><uv>voiced</uv><ap>velar</ap><am>stop</am></phone>
  <phone><id>41</id><name>s_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>42</id><name>sh_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>43</id><name>t_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>44</id><name>u_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>stop</am></phone>
  <phone><id>45</id><name>ua_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>fricative</am></phone>
  <phone><id>46</id><name>uai_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>fricative</am></phone>
  <phone><id>47</id><name>uan_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>48</id><name>uang_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>49</id><name>uei_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>50</id><name>uen_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>51</id><name>ueng_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>52</id><name>uo_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>53</id><name>v_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>54</id><name>van_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>55</id><name>ve_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>56</id><name>vn_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>57</id><name>xx_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>58</id><name>z_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>59</id><name>zh_c</name><cv>vowel</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>60</id><name>w_c</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>61</id><name>y_c</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>62</id><name>ga</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>centraltongue</ap><am>lateral</am></phone>
  <phone><id>63</id><name>ge</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>centraltongue</ap><am>lateral</am></phone>
  <phone><id>64</id><name>go</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>centraltongue</ap><am>lateral</am></phone>
  <phone><id>65</id><name>aa</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>66</id><name>ae</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>67</id><name>ah</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>68</id><name>ao</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>69</id><name>aw</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>70</id><name>ay</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>71</id><name>b</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>72</id><name>ch</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>backtongue</ap><am>affricative</am></phone>
  <phone><id>73</id><name>d</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>74</id><name>dh</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>75</id><name>eh</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>76</id><name>er</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>77</id><name>ey</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>78</id><name>f</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>liptooth</ap><am>fricative</am></phone>
  <phone><id>79</id><name>g</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>velar</ap><am>stop</am></phone>
  <phone><id>80</id><name>hh</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>81</id><name>ih</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>82</id><name>iy</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>83</id><name>jh</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>84</id><name>k</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>velar</ap><am>stop</am></phone>
  <phone><id>85</id><name>l</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>centraltongue</ap><am>lateral</am></phone>
  <phone><id>86</id><name>m</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>doublelips</ap><am>nasal</am></phone>
  <phone><id>87</id><name>n</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>centraltongue</ap><am>nasal</am></phone>
  <phone><id>88</id><name>ng</name><cv>consonant</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>89</id><name>ow</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>90</id><name>oy</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>91</id><name>p</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>92</id><name>r</name><cv>consonant</cv><if>initial</if><uv>voiced</uv><ap>backtongue</ap><am>fricative</am></phone>
  <phone><id>93</id><name>s</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>fricative</am></phone>
  <phone><id>94</id><name>sh</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>backtongue</ap><am>fricative</am></phone>
  <phone><id>95</id><name>t</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>96</id><name>th</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>97</id><name>uh</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>98</id><name>uw</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>99</id><name>v</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>100</id><name>w</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>101</id><name>y</name><cv>consonant</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>102</id><name>z</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>103</id><name>zh</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>backtongue</ap><am>affricative</am></phone>
  <phone><id>104</id><name>air_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>105</id><name>angr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>106</id><name>anr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>107</id><name>aor_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>108</id><name>ar_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>109</id><name>eir_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>110</id><name>engr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>doublelips</ap><am>stop</am></phone>
  <phone><id>111</id><name>enr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>low</ap><am>open</am></phone>
  <phone><id>112</id><name>iangr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>113</id><name>ianr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>114</id><name>iaor_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>centraltongue</ap><am>stop</am></phone>
  <phone><id>115</id><name>iar_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>fronttongue</ap><am>affricative</am></phone>
  <phone><id>116</id><name>ier_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>117</id><name>ihr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>118</id><name>iir_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>119</id><name>ingr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>120</id><name>inr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>121</id><name>iongr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>122</id><name>iour_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>123</id><name>ir_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>backtongue</ap><am>affricative</am></phone>
  <phone><id>124</id><name>ongr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>125</id><name>or_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>126</id><name>our_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>middle</ap><am>open</am></phone>
  <phone><id>127</id><name>uair_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>fricative</am></phone>
  <phone><id>128</id><name>uangr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>129</id><name>uanr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>130</id><name>uar_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>fricative</am></phone>
  <phone><id>131</id><name>ueir_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>close</am></phone>
  <phone><id>132</id><name>uenr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>133</id><name>uor_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>134</id><name>ur_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>velar</ap><am>stop</am></phone>
  <phone><id>135</id><name>vanr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>136</id><name>ver_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>137</id><name>vnr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>138</id><name>vr_c</name><cv>vowel</cv><if>final</if><uv>voiced</uv><ap>high</ap><am>open</am></phone>
  <phone><id>146</id><name>pau</name><cv>consonant</cv><if>initial</if><uv>unvoiced</uv><ap>high</ap><am>close</am></phone>
</phoneSet>