hehl2 / Torchaudio · Commits

Commit 9dcc7a15, authored Apr 25, 2022 by flyingdown

    init v0.10.0

Parent: db2b0b79
Changes: 416 files. Showing 20 changed files with 2308 additions and 0 deletions (+2308 / -0).
Too many changes to show; to preserve performance only 416 of 416+ files are displayed.

    test/torchaudio_unittest/common_utils/rnnt_utils.py              +603  -0
    test/torchaudio_unittest/common_utils/sox_utils.py               +106  -0
    test/torchaudio_unittest/common_utils/wav_utils.py                +92  -0
    test/torchaudio_unittest/compliance_kaldi_test.py                 +76  -0
    test/torchaudio_unittest/datasets/__init__.py                      +0  -0
    test/torchaudio_unittest/datasets/cmuarctic_test.py               +84  -0
    test/torchaudio_unittest/datasets/cmudict_test.py                +218  -0
    test/torchaudio_unittest/datasets/commonvoice_test.py            +148  -0
    test/torchaudio_unittest/datasets/datasets_test.py                +15  -0
    test/torchaudio_unittest/datasets/gtzan_test.py                  +127  -0
    test/torchaudio_unittest/datasets/librispeech_test.py            +128  -0
    test/torchaudio_unittest/datasets/libritts_test.py                +89  -0
    test/torchaudio_unittest/datasets/ljspeech_test.py                +92  -0
    test/torchaudio_unittest/datasets/speechcommands_test.py         +161  -0
    test/torchaudio_unittest/datasets/tedlium_test.py                +150  -0
    test/torchaudio_unittest/datasets/utils_test.py                   +37  -0
    test/torchaudio_unittest/datasets/vctk_test.py                   +107  -0
    test/torchaudio_unittest/datasets/yesno_test.py                   +67  -0
    test/torchaudio_unittest/example/__init__.py                       +8  -0
    test/torchaudio_unittest/example/souce_sepration/__init__.py       +0  -0
test/torchaudio_unittest/common_utils/rnnt_utils.py  0 → 100644
import unittest
import random

import torch
import numpy as np
from torchaudio.functional import rnnt_loss

CPU_DEVICE = torch.device("cpu")


class _NumpyTransducer(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        device = log_probs.device
        log_probs = log_probs.cpu().data.numpy()
        logit_lengths = logit_lengths.cpu().data.numpy()
        target_lengths = target_lengths.cpu().data.numpy()
        targets = targets.cpu().data.numpy()

        gradients, costs, _, _ = __class__.compute(
            log_probs=log_probs,
            logit_lengths=logit_lengths,
            target_lengths=target_lengths,
            targets=targets,
            blank=blank,
        )

        costs = torch.FloatTensor(costs).to(device=device)
        gradients = torch.FloatTensor(gradients).to(device=device)
        ctx.grads = torch.autograd.Variable(gradients)

        return costs

    @staticmethod
    def backward(ctx, grad_output):
        grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
        return ctx.grads.mul(grad_output), None, None, None, None, None, None, None, None

    @staticmethod
    def compute_alpha_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        alpha = np.zeros((max_T, max_U), dtype=np.float32)
        for t in range(1, max_T):
            alpha[t, 0] = alpha[t - 1, 0] + log_probs[t - 1, 0, blank]

        for u in range(1, max_U):
            alpha[0, u] = alpha[0, u - 1] + log_probs[0, u - 1, targets[u - 1]]

        for t in range(1, max_T):
            for u in range(1, max_U):
                skip = alpha[t - 1, u] + log_probs[t - 1, u, blank]
                emit = alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]]
                alpha[t, u] = np.logaddexp(skip, emit)

        cost = -(alpha[-1, -1] + log_probs[-1, -1, blank])
        return alpha, cost

    @staticmethod
    def compute_beta_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        beta = np.zeros((max_T, max_U), dtype=np.float32)
        beta[-1, -1] = log_probs[-1, -1, blank]

        for t in reversed(range(max_T - 1)):
            beta[t, -1] = beta[t + 1, -1] + log_probs[t, -1, blank]

        for u in reversed(range(max_U - 1)):
            beta[-1, u] = beta[-1, u + 1] + log_probs[-1, u, targets[u]]

        for t in reversed(range(max_T - 1)):
            for u in reversed(range(max_U - 1)):
                skip = beta[t + 1, u] + log_probs[t, u, blank]
                emit = beta[t, u + 1] + log_probs[t, u, targets[u]]
                beta[t, u] = np.logaddexp(skip, emit)

        cost = -beta[0, 0]
        return beta, cost

    @staticmethod
    def compute_gradients_one_sequence(log_probs, alpha, beta, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        gradients = np.full(log_probs.shape, float("-inf"))
        cost = -beta[0, 0]

        gradients[-1, -1, blank] = alpha[-1, -1]
        gradients[:-1, :, blank] = alpha[:-1, :] + beta[1:, :]

        for u, l in enumerate(targets):
            gradients[:, u, l] = alpha[:, u] + beta[:, u + 1]

        gradients = -(np.exp(gradients + log_probs + cost))
        return gradients

    @staticmethod
    def compute(
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        gradients = np.zeros_like(log_probs)
        B_tgt, max_T, max_U, D = log_probs.shape
        B_src = logit_lengths.shape[0]

        H = int(B_tgt / B_src)

        alphas = np.zeros((B_tgt, max_T, max_U))
        betas = np.zeros((B_tgt, max_T, max_U))
        betas.fill(float("-inf"))
        alphas.fill(float("-inf"))
        costs = np.zeros(B_tgt)
        for b_tgt in range(B_tgt):
            b_src = int(b_tgt / H)
            T = int(logit_lengths[b_src])
            # NOTE: see https://arxiv.org/pdf/1211.3711.pdf Section 2.1
            U = int(target_lengths[b_tgt]) + 1

            seq_log_probs = log_probs[b_tgt, :T, :U, :]
            seq_targets = targets[b_tgt, : int(target_lengths[b_tgt])]
            alpha, alpha_cost = __class__.compute_alpha_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            beta, beta_cost = __class__.compute_beta_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            seq_gradients = __class__.compute_gradients_one_sequence(
                log_probs=seq_log_probs,
                alpha=alpha,
                beta=beta,
                targets=seq_targets,
                blank=blank,
            )
            np.testing.assert_almost_equal(alpha_cost, beta_cost, decimal=2)
            gradients[b_tgt, :T, :U, :] = seq_gradients
            costs[b_tgt] = beta_cost
            alphas[b_tgt, :T, :U] = alpha
            betas[b_tgt, :T, :U] = beta

        return gradients, costs, alphas, betas


class NumpyTransducerLoss(torch.nn.Module):
    def __init__(self, blank=-1):
        super().__init__()
        self.blank = blank

    def forward(
        self,
        logits,
        logit_lengths,
        target_lengths,
        targets,
    ):
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        return _NumpyTransducer.apply(
            log_probs,
            logit_lengths,
            target_lengths,
            targets,
            self.blank,
        )


def compute_with_numpy_transducer(data):
    costs = NumpyTransducerLoss(
        blank=data["blank"],
    )(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
    )

    loss = torch.sum(costs)
    loss.backward()
    costs = costs.cpu()
    gradients = data["logits"].saved_grad.cpu()
    return costs, gradients


def compute_with_pytorch_transducer(data):
    costs = rnnt_loss(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
        blank=data["blank"],
        reduction="none",
    )

    loss = torch.sum(costs)
    loss.backward()
    costs = costs.cpu()
    gradients = data["logits"].saved_grad.cpu()
    return costs, gradients


def get_basic_data(device):
    # Example provided
    # in 6f73a2513dc784c59eec153a45f40bc528355b18
    # of https://github.com/HawkAaron/warp-transducer
    logits = torch.tensor(
        [
            [
                [
                    [0.1, 0.6, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.6, 0.1, 0.1],
                    [0.1, 0.1, 0.2, 0.8, 0.1],
                ],
                [
                    [0.1, 0.6, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.2, 0.1, 0.1],
                    [0.7, 0.1, 0.2, 0.1, 0.1],
                ],
            ]
        ],
        dtype=torch.float32,
        device=device,
    )
    targets = torch.tensor([[1, 2]], dtype=torch.int, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int, device=device)

    logits.requires_grad_(True)

    return logits, targets, logit_lengths, target_lengths


def get_B1_T10_U3_D4_data(
    random=False,
    dtype=torch.float32,
    device=CPU_DEVICE,
):
    B, T, U, D = 2, 10, 3, 4

    logits = torch.rand(B, T, U, D, dtype=dtype, device=device)
    if not random:
        logits.fill_(0.1)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    data = {}
    data["logits"] = logits
    data["logit_lengths"] = torch.tensor([10, 10], dtype=torch.int32, device=device)
    data["target_lengths"] = torch.tensor([2, 2], dtype=torch.int32, device=device)
    data["targets"] = torch.tensor([[1, 2], [1, 2]], dtype=torch.int32, device=device)
    data["blank"] = 0

    return data


def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
    logits = torch.tensor(
        [
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.6, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.8, 0.1,
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.1, 0.1,
            0.7, 0.1, 0.2, 0.1, 0.1,
        ],
        dtype=dtype,
        device=device,
    ).reshape(1, 2, 3, 5)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int32, device=device)

    blank = -1

    ref_costs = torch.tensor([5.09566688538], dtype=dtype)
    ref_gradients = torch.tensor(
        [
            0.17703132, -0.39992708, 0.17703132, 0.17703132, -0.13116692,
            0.12247062, 0.12247062, -0.181684, 0.12247062, -0.1857276,
            0.06269141, 0.06269141, 0.06928471, 0.12624498, -0.32091248,
            0.05456069, -0.2182428, 0.05456069, 0.05456069, 0.05456069,
            0.12073967, 0.12073967, -0.48295838, 0.12073967, 0.12073967,
            0.30741188, 0.16871123, 0.18645471, 0.16871123, -0.83128875,
        ],
        dtype=dtype,
    ).reshape(1, 2, 3, 5)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients


def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
    # Test from D21322854
    logits = torch.tensor(
        [
            0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135,
            0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539,
            0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882,
            0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400,
            0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106,
            0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160,
            0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467,
            0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474,
            0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356,
            0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830,
            0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289,
            0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021,
        ],
        dtype=dtype,
        device=device,
    ).reshape(2, 4, 3, 3)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([4, 4], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2, 2], dtype=torch.int32, device=device)

    blank = 0

    ref_costs = torch.tensor([4.2806528590890736, 3.9384369822503591], dtype=dtype)
    ref_gradients = torch.tensor(
        [
            -0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977,
            -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832,
            -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188,
            -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845,
            -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615,
            0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419,
            -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463,
            -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441,
            -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804,
            0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642,
            -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748,
            0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484,
        ],
        dtype=dtype,
    ).reshape(2, 4, 3, 3)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients


def get_random_data(
    max_B=8,
    max_T=128,
    max_U=32,
    max_D=40,
    blank=-1,
    dtype=torch.float32,
    device=CPU_DEVICE,
    seed=None,
):
    if seed is not None:
        torch.manual_seed(seed=seed)

    if blank != -1:
        raise ValueError("blank != -1 is not supported yet.")

    random.seed(0)
    B = random.randint(1, max_B - 1)
    T = random.randint(5, max_T - 1)
    U = random.randint(5, max_U - 1)
    D = random.randint(2, max_D - 1)

    logit_lengths = torch.randint(low=5, high=T + 1, size=(B,), dtype=torch.int32, device=device)
    target_lengths = torch.randint(low=5, high=U + 1, size=(B,), dtype=torch.int32, device=device)
    max_src_length = torch.max(logit_lengths)
    max_tgt_length = torch.max(target_lengths)

    targets = torch.randint(
        low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device
    )
    logits = torch.rand(
        size=(B, max_src_length, max_tgt_length + 1, D),
        dtype=dtype,
        device=device,
    ).requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    return {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }


def skipIfNoRNNT(test_item):
    try:
        torch.ops.torchaudio.rnnt_loss
        return test_item
    except RuntimeError:
        return unittest.skip("torchaudio C++ extension is not compiled with RNN transducer loss")
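Editor's note (not part of the diff): these helpers are designed to be paired in the RNN-T loss tests. A data dict from one of the get_*_data factories is fed to both the NumPy reference and the torchaudio implementation, and the results are checked against the bundled reference values. A minimal sketch of that pattern, assuming the torchaudio C++ rnnt_loss extension is available for the PyTorch path; tolerances here are illustrative, not the ones the actual tests use.

# Sketch: compare the NumPy reference and torchaudio rnnt_loss on the
# reference case from D21322854, using only the helpers defined above.
data, ref_costs, ref_gradients = get_B2_T4_U3_D3_data()
np_costs, np_gradients = compute_with_numpy_transducer(data)

# Gradients accumulate on `logits`, so build a fresh data dict before
# running the PyTorch implementation on the same case.
data, _, _ = get_B2_T4_U3_D3_data()
pt_costs, pt_gradients = compute_with_pytorch_transducer(data)

torch.testing.assert_close(np_costs, ref_costs, atol=1e-4, rtol=1e-4)
torch.testing.assert_close(pt_costs, ref_costs, atol=1e-4, rtol=1e-4)
torch.testing.assert_close(pt_gradients, ref_gradients, atol=1e-4, rtol=1e-4)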
test/torchaudio_unittest/common_utils/sox_utils.py  0 → 100644
import sys
import subprocess
import warnings


def get_encoding(dtype):
    encodings = {
        'float32': 'floating-point',
        'int32': 'signed-integer',
        'int16': 'signed-integer',
        'uint8': 'unsigned-integer',
    }
    return encodings[dtype]


def get_bit_depth(dtype):
    bit_depths = {
        'float32': 32,
        'int32': 32,
        'int16': 16,
        'uint8': 8,
    }
    return bit_depths[dtype]


def gen_audio_file(
        path, sample_rate, num_channels, *,
        encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1,
        comment_file=None,
):
    """Generate synthetic audio file with `sox` command."""
    if path.endswith('.wav'):
        warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
    command = [
        'sox',
        '-V3',  # verbose
        '--no-dither',  # disable automatic dithering
        '-R',
        # -R is supposed to be repeatable, though the implementation looks suspicious
        # and not setting the seed to a fixed value.
        # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
        # search "sox_globals.repeatable"
    ]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    command += [
        '--rate', str(sample_rate),
        '--null',  # no input
        '--channels', str(num_channels),
    ]
    if compression is not None:
        command += ['--compression', str(compression)]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if comment_file is not None:
        command += ['--comment-file', str(comment_file)]
    command += [
        str(path),
        'synth', str(duration),  # synthesizes for the given duration [sec]
        'sawtooth', '1',
        # saw tooth covers the both ends of value range, which is a good property for test.
        # similar to linspace(-1., 1.)
        # this introduces bigger boundary effect than sine when converted to mp3
    ]
    if attenuation is not None:
        command += ['vol', f'-{attenuation}dB']
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)


def convert_audio_file(
        src_path, dst_path, *,
        encoding=None, bit_depth=None, compression=None):
    """Convert audio file with `sox` command."""
    command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if compression is not None:
        command += ['--compression', str(compression)]
    command += [dst_path]
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)


def _flattern(effects):
    if not effects:
        return effects
    if isinstance(effects[0], str):
        return effects
    return [item for sublist in effects for item in sublist]


def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
    """Run sox effects"""
    effect = _flattern(effect)
    command = ['sox', '-V', '--no-dither', input_file]
    if output_bitdepth:
        command += ['--bits', str(output_bitdepth)]
    command += [output_file] + effect
    if output_sample_rate:
        command += ['rate', str(output_sample_rate)]
    print(' '.join(command))
    subprocess.run(command, check=True)
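Editor's note (illustrative, not part of the diff): a typical way the helpers above would be combined when building non-wav fixtures. This assumes the `sox` binary is on PATH and compiled with FLAC support; the /tmp paths are placeholders.

# Generate a 2-channel, 8 kHz FLAC fixture, then convert it to a
# 32-bit-float WAV using the dtype lookup helpers.
gen_audio_file('/tmp/fixture.flac', 8000, 2, bit_depth=16, duration=1)
convert_audio_file(
    '/tmp/fixture.flac',
    '/tmp/fixture.wav',
    encoding=get_encoding('float32'),
    bit_depth=get_bit_depth('float32'),
)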
test/torchaudio_unittest/common_utils/wav_utils.py  0 → 100644
from typing import Optional

import torch
import scipy.io.wavfile


def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
    if tensor.dtype == torch.float32:
        pass
    elif tensor.dtype == torch.int32:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 2147483647.
        tensor[tensor < 0] /= 2147483648.
    elif tensor.dtype == torch.int16:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 32767.
        tensor[tensor < 0] /= 32768.
    elif tensor.dtype == torch.uint8:
        tensor = tensor.to(torch.float32) - 128
        tensor[tensor > 0] /= 127.
        tensor[tensor < 0] /= 128.
    return tensor


def get_wav_data(
        dtype: str,
        num_channels: int,
        *,
        num_frames: Optional[int] = None,
        normalize: bool = True,
        channels_first: bool = True,
):
    """Generate linear signal of the given dtype and num_channels

    Data range is
        [-1.0, 1.0] for float32,
        [-2147483648, 2147483647] for int32
        [-32768, 32767] for int16
        [0, 255] for uint8

    num_frames allow to change the linear interpolation parameter.
    Default values are 256 for uint8, else 1 << 16.
    1 << 16 as default is so that int16 value range is completely covered.
    """
    dtype_ = getattr(torch, dtype)

    if num_frames is None:
        if dtype == 'uint8':
            num_frames = 256
        else:
            num_frames = 1 << 16

    if dtype == 'uint8':
        base = torch.linspace(0, 255, num_frames, dtype=dtype_)
    elif dtype == 'int8':
        base = torch.linspace(-128, 127, num_frames, dtype=dtype_)
    elif dtype == 'float32':
        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
    elif dtype == 'float64':
        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
    elif dtype == 'int32':
        base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_)
    elif dtype == 'int16':
        base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_)
    else:
        raise NotImplementedError(f'Unsupported dtype {dtype}')
    data = base.repeat([num_channels, 1])
    if not channels_first:
        data = data.transpose(1, 0)
    if normalize:
        data = normalize_wav(data)
    return data


def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor:
    """Load wav file without torchaudio"""
    sample_rate, data = scipy.io.wavfile.read(path)
    data = torch.from_numpy(data.copy())
    if data.ndim == 1:
        data = data.unsqueeze(1)
    if normalize:
        data = normalize_wav(data)
    if channels_first:
        data = data.transpose(1, 0)
    return data, sample_rate


def save_wav(path, data, sample_rate, channels_first=True):
    """Save wav file without torchaudio"""
    if channels_first:
        data = data.transpose(1, 0)
    scipy.io.wavfile.write(path, sample_rate, data.numpy())
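Editor's note (illustrative, not part of the diff): the point of these helpers is a torchaudio-free round trip, so a saved fixture should load back to exactly the values the tests expect. A quick sanity sketch; the /tmp path is a placeholder.

# Save an un-normalized int16 stereo ramp with scipy, load it back, and
# check it matches the normalized reference produced by normalize_wav.
data = get_wav_data('int16', num_channels=2, num_frames=100, normalize=False)
save_wav('/tmp/ramp.wav', data, 8000)
loaded, sample_rate = load_wav('/tmp/ramp.wav')
assert sample_rate == 8000
assert torch.equal(normalize_wav(data), loaded)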
test/torchaudio_unittest/compliance_kaldi_test.py  0 → 100644
import torch
import torchaudio.compliance.kaldi as kaldi

from torchaudio_unittest import common_utils


def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
    # just a copy of ExtractWindow from feature-window.cc in python
    def first_sample_of_frame(frame, window_size, window_shift, snip_edges):
        if snip_edges:
            return frame * window_shift
        else:
            midpoint_of_frame = frame * window_shift + window_shift // 2
            beginning_of_frame = midpoint_of_frame - window_size // 2
            return beginning_of_frame

    sample_offset = 0
    num_samples = sample_offset + wave.size(0)
    start_sample = first_sample_of_frame(f, frame_length, frame_shift, snip_edges)
    end_sample = start_sample + frame_length

    if snip_edges:
        assert start_sample >= sample_offset and end_sample <= num_samples
    else:
        assert sample_offset == 0 or start_sample >= sample_offset

    wave_start = start_sample - sample_offset
    wave_end = wave_start + frame_length
    if wave_start >= 0 and wave_end <= wave.size(0):
        window[f, :] = wave[wave_start:(wave_start + frame_length)]
    else:
        wave_dim = wave.size(0)
        for s in range(frame_length):
            s_in_wave = s + wave_start
            while s_in_wave < 0 or s_in_wave >= wave_dim:
                if s_in_wave < 0:
                    s_in_wave = -s_in_wave - 1
                else:
                    s_in_wave = 2 * wave_dim - 1 - s_in_wave
            window[f, s] = wave[s_in_wave]


class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):

    def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
        waveform = torch.arange(num_samples).float()
        output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

        # from NumFrames in feature-window.cc
        n = window_size
        if snip_edges:
            m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
        else:
            m = (num_samples + (window_shift // 2)) // window_shift

        self.assertTrue(output.dim() == 2)
        self.assertTrue(output.shape[0] == m and output.shape[1] == n)

        window = torch.empty((m, window_size))
        for r in range(m):
            extract_window(window, waveform, r, window_size, window_shift, snip_edges)
        self.assertEqual(window, output)

    def test_get_strided(self):
        # generate any combination where 0 < window_size <= num_samples and
        # 0 < window_shift.
        for num_samples in range(1, 20):
            for window_size in range(1, num_samples + 1):
                for window_shift in range(1, 2 * num_samples + 1):
                    for snip_edges in range(0, 2):
                        self._test_get_strided_helper(num_samples, window_size, window_shift, snip_edges)

    def test_mfcc_empty(self):
        # Passing in an empty tensor should result in an error
        self.assertRaises(AssertionError, kaldi.mfcc, torch.empty(0))
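Editor's note (illustrative, not part of the diff): for `snip_edges=True` the framing that `kaldi._get_strided` is tested against can be reproduced with public ops. A rough sketch of the same windowing via `as_strided`, under the assumption of a contiguous 1-D input; it mirrors the private helper only in the snip_edges case.

import torch

def frame_snip_edges(waveform, window_size, window_shift):
    # Frame a 1-D waveform into overlapping rows of length window_size,
    # keeping only fully contained frames (snip_edges=True behavior,
    # same frame count formula as NumFrames above).
    num_samples = waveform.size(0)
    m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
    stride = waveform.stride(0)
    return waveform.as_strided((m, window_size), (window_shift * stride, stride))

waveform = torch.arange(10).float()
print(frame_snip_edges(waveform, window_size=4, window_shift=2))
# rows: [0..3], [2..5], [4..7], [6..9]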
test/torchaudio_unittest/datasets/__init__.py  0 → 100644  (empty file)
test/torchaudio_unittest/datasets/cmuarctic_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import cmuarctic

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    sample_rate = 16000
    transcript = "This is a test transcript."

    base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic")
    txt_dir = os.path.join(base_dir, "etc")
    os.makedirs(txt_dir, exist_ok=True)
    txt_file = os.path.join(txt_dir, "txt.done.data")
    audio_dir = os.path.join(base_dir, "wav")
    os.makedirs(audio_dir, exist_ok=True)

    seed = 42
    with open(txt_file, "w") as txt:
        for c in ["a", "b"]:
            for i in range(5):
                utterance_id = f"arctic_{c}{i:04d}"
                path = os.path.join(audio_dir, f"{utterance_id}.wav")
                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=3,
                    n_channels=1,
                    dtype="int16",
                    seed=seed,
                )
                save_wav(path, data, sample_rate)
                sample = (
                    normalize_wav(data),
                    sample_rate,
                    transcript,
                    utterance_id.split("_")[1],
                )
                mocked_data.append(sample)
                txt.write(f'( {utterance_id} "{transcript}" )\n')
                seed += 1
    return mocked_data


class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_cmuarctic(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, transcript, utterance_id) in enumerate(dataset):
            expected_sample = self.samples[i]
            assert sample_rate == expected_sample[1]
            assert transcript == expected_sample[2]
            assert utterance_id == expected_sample[3]
            self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8)
            n_ite += 1
        assert n_ite == len(self.samples)

    def test_cmuarctic_str(self):
        dataset = cmuarctic.CMUARCTIC(self.root_dir)
        self._test_cmuarctic(dataset)

    def test_cmuarctic_path(self):
        dataset = cmuarctic.CMUARCTIC(Path(self.root_dir))
        self._test_cmuarctic(dataset)
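Editor's note (illustrative, not part of the diff): every dataset test in this commit follows the same fixture pattern — write a fake corpus into a temp directory with get_mock_dataset, then point the real dataset class at it. The unittest scaffolding stripped away, the pattern looks roughly like this; it assumes a working torchaudio I/O backend, which TorchaudioTestCase normally configures.

import tempfile

# Build the fake ARCTIC layout, then iterate the real dataset over it and
# compare the metadata fields it yields against the mocked expectations.
with tempfile.TemporaryDirectory() as root_dir:
    expected = get_mock_dataset(root_dir)
    dataset = cmuarctic.CMUARCTIC(root_dir)
    for (waveform, sample_rate, transcript, utterance_id), exp in zip(dataset, expected):
        assert (sample_rate, transcript, utterance_id) == exp[1:]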
test/torchaudio_unittest/datasets/cmudict_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import CMUDict

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
)


def get_mock_dataset(root_dir, return_punc=False):
    """
    root_dir: directory to the mocked dataset
    """
    header = [
        ";;; # CMUdict -- Major Version: 0.07",
        ";;; ",
        ";;; # $HeadURL$",
    ]
    puncs = [
        "!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "\"CLOSE-QUOTE K L OW1 Z K W OW1 T",
        "#HASH-MARK HH AE1 M AA2 R K",
        "%PERCENT P ER0 S EH1 N T",
        "&ERSAND AE1 M P ER0 S AE2 N D",
        "'END-INNER-QUOTE EH1 N D IH1 N ER0 K W OW1 T",
        "(BEGIN-PARENS B IH0 G IH1 N P ER0 EH1 N Z",
        ")CLOSE-PAREN K L OW1 Z P ER0 EH1 N",
        "+PLUS P L UH1 S",
        ",COMMA K AA1 M AH0",
        "--DASH D AE1 SH",
        "!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "/SLASH S L AE1 SH",
        ":COLON K OW1 L AH0 N",
        ";SEMI-COLON S EH1 M IY0 K OW1 L AH0 N",
        "?QUESTION-MARK K W EH1 S CH AH0 N M AA1 R K",
        "{BRACE B R EY1 S",
        "}CLOSE-BRACE K L OW1 Z B R EY1 S",
        "...ELLIPSIS IH2 L IH1 P S IH0 S",
    ]
    punc_outputs = [
        "!", "\"", "#", "%", "&", "'", "(", ")", "+", ",",
        "--", "!", "/", ":", ";", "?", "{", "}", "...",
    ]
    words = [
        "3-D TH R IY1 D IY2",
        "'BOUT B AW1 T",
        "'CAUSE K AH0 Z",
        "'TWAS T W AH1 Z",
        "A AH0",
        "B B IY1",
        "C S IY1",
        "D D IY1",
        "E IY1",
        "F EH1 F",
        "G JH IY1",
        "H EY1 CH",
        "I AY1",
        "J JH EY1",
        "K K EY1",
        "L EH1 L",
        "M EH1 M",
        "N EH1 N",
        "O OW1",
        "P P IY1",
        "Q K Y UW1",
        "R AA1 R",
        "S EH1 S",
        "T T IY1",
        "U Y UW1",
        "V V IY1",
        "X EH1 K S",
        "Y W AY1",
        "Z Z IY1",
    ]

    mocked_symbols = [
        "AA1", "AA2", "AE1", "AE2", "AH0", "AH1", "AY1", "B", "CH", "D",
        "EH1", "EH2", "ER0", "EY1", "F", "G", "HH", "IH0", "IH1", "IY0",
        "IY1", "IY2", "JH", "K", "L", "M", "N", "OW1", "OY2", "P", "R",
        "S", "SH", "T", "TH", "UH1", "UW0", "UW1", "V", "W", "Y", "Z",
    ]

    dict_file = os.path.join(root_dir, "cmudict-0.7b")
    symbol_file = os.path.join(root_dir, "cmudict-0.7b.symbols")

    with open(dict_file, "w") as fileobj:
        for section in [header, puncs, words]:
            for line in section:
                fileobj.write(line)
                fileobj.write("\n")
    with open(symbol_file, "w") as txt:
        txt.write("\n".join(mocked_symbols))

    mocked_data = []
    if return_punc:
        for i, ent in enumerate(puncs):
            # split only on the first space: the remainder is the phone sequence
            _, phones = ent.split(" ", 1)
            mocked_data.append((punc_outputs[i], phones.split(" ")))
    for ent in words:
        word, phones = ent.split(" ", 1)
        mocked_data.append((word, phones.split(" ")))
    return mocked_data


class TestCMUDict(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    root_punc_dir = None
    samples = []
    punc_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = os.path.join(cls.get_base_temp_dir(), "normal")
        os.mkdir(cls.root_dir)
        cls.samples = get_mock_dataset(cls.root_dir)
        cls.root_punc_dir = os.path.join(cls.get_base_temp_dir(), "punc")
        os.mkdir(cls.root_punc_dir)
        cls.punc_samples = get_mock_dataset(cls.root_punc_dir, return_punc=True)

    def _test_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.samples)

    def _test_punc_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data with punctuations correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.punc_samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.punc_samples)

    def test_cmuarctic_path_with_punctuation(self):
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_str_with_punctuation(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_path(self):
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=True)
        self._test_cmudict(dataset)

    def test_cmuarctic_str(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=True)
        self._test_cmudict(dataset)
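Editor's note (illustrative, not part of the diff): unlike the audio datasets, CMUDict yields (word, phone-list) pairs, which is what the asserts above compare. A quick sketch of what an entry looks like, assuming `root_dir` already contains the files written by get_mock_dataset above.

dataset = CMUDict(root_dir, exclude_punctuations=True)
word, phones = dataset[0]
# With the mocked data above, the first non-punctuation entry is
# "3-D" with phones ["TH", "R", "IY1", "D", "IY2"].
print(word, phones)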
test/torchaudio_unittest/datasets/commonvoice_test.py  0 → 100644
import csv
import os
from pathlib import Path
from typing import Tuple, Dict

from torch import Tensor

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets import COMMONVOICE

_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
_SAMPLE_RATE = 48000
_HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
_EN_TRAIN_CSV_CONTENTS = [
    ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
     "common_voice_en_18885784.wav",
     "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
     "2", "0", "", "", ""],
    ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
     "common_voice_en_556542.wav",
     "Once more into the breach",
     "2", "0", "thirties", "male", "us"],
    ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
     "common_voice_en_18607573.wav",
     "Caddy, show Miss Clare and Miss Summerson their rooms.",
     "2", "0", "twenties", "male", "canada"],
]
_FR_TRAIN_CSV_CONTENTS = [
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
     "18343441c601cae0597a4b0d3144",
     "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
     "16cebac98ee5349e3e8262cb9329",
     "Or sur ce point nous n’avons aucune réponse de votre part.",
     "2", "0", "twenties", "male", "france"],
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
     "343441c601cae0597a4b0d3144",
     "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
     "cbd395acbdfcfa9d76a6e199bbd",
     "Monsieur de La Verpillière, laissez parler le ministre",
     "2", "0", "twenties", "male", "france"],
]


def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares mocked dataset
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(train_csv_contents):
            content[2] = str(content[2].encode("utf-8"))
            writer.writerow(content)
            if not content[1].endswith(ext_audio):
                audio_path = os.path.join(audio_base_path, content[1] + ext_audio)
            else:
                audio_path = os.path.join(audio_base_path, content[1])

            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data


def get_mock_dataset_en(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares english mocked dataset
    """
    return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)


def get_mock_dataset_fr(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares french mocked dataset
    """
    return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)


class BaseTestCommonVoice(TempDirMixin):
    root_dir = None
    data = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        COMMONVOICE._ext_audio = ".wav"

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO

    def _test_commonvoice(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_dictionary = self.data[i][2]
            expected_data = self.data[i][0]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        assert n_ite == len(self.data)


class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)


class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)
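Editor's note (illustrative, not part of the diff): the `_ext_audio` swap in BaseTestCommonVoice is the trick that makes this test possible without mp3 fixtures. The real CommonVoice corpus ships compressed clips, but the mocked clips are written as .wav, so the class attribute is patched for the duration of the tests and restored in tearDownClass. The essential shape of that pattern, assuming `root_dir` was prepared by get_mock_dataset:

COMMONVOICE._ext_audio = ".wav"      # point the loader at the wav fixtures
try:
    dataset = COMMONVOICE(root_dir)  # yields (waveform, sample_rate, metadata dict)
finally:
    COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO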
test/torchaudio_unittest/datasets/datasets_test.py  0 → 100644
from torchaudio.datasets.vctk import VCTK

from torchaudio_unittest.common_utils import (
    TorchaudioTestCase,
    get_asset_path,
)


class TestDatasets(TorchaudioTestCase):
    backend = 'default'
    path = get_asset_path()

    def test_vctk(self):
        data = VCTK(self.path)
        data[0]
test/torchaudio_unittest/datasets/gtzan_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import gtzan

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_samples = []
    mocked_training = []
    mocked_validation = []
    mocked_testing = []
    sample_rate = 22050
    seed = 0
    for genre in gtzan.gtzan_genres:
        base_dir = os.path.join(root_dir, 'genres', genre)
        os.makedirs(base_dir, exist_ok=True)
        for i in range(100):
            filename = f'{genre}.{i:05d}'
            path = os.path.join(base_dir, f'{filename}.wav')
            data = get_whitenoise(
                sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
            save_wav(path, data, sample_rate)
            sample = (normalize_wav(data), sample_rate, genre)
            mocked_samples.append(sample)
            if filename in gtzan.filtered_test:
                mocked_testing.append(sample)
            if filename in gtzan.filtered_train:
                mocked_training.append(sample)
            if filename in gtzan.filtered_valid:
                mocked_validation.append(sample)
            seed += 1
    return (mocked_samples, mocked_training, mocked_validation, mocked_testing)


class TestGTZAN(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []
    training = []
    validation = []
    testing = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        mocked_data = get_mock_dataset(cls.root_dir)
        cls.samples = mocked_data[0]
        cls.training = mocked_data[1]
        cls.validation = mocked_data[2]
        cls.testing = mocked_data[3]

    def test_no_subset(self):
        dataset = gtzan.GTZAN(self.root_dir)
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert label == self.samples[i][2]
            n_ite += 1
        assert n_ite == len(self.samples)

    def _test_training(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.training[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.training[i][1]
            assert label == self.training[i][2]
            n_ite += 1
        assert n_ite == len(self.training)

    def _test_validation(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.validation[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.validation[i][1]
            assert label == self.validation[i][2]
            n_ite += 1
        assert n_ite == len(self.validation)

    def _test_testing(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.testing[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.testing[i][1]
            assert label == self.testing[i][2]
            n_ite += 1
        assert n_ite == len(self.testing)

    def test_training_str(self):
        train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_str(self):
        val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_str(self):
        test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
        self._test_testing(test_dataset)

    def test_training_path(self):
        root_dir = Path(self.root_dir)
        train_dataset = gtzan.GTZAN(root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_path(self):
        root_dir = Path(self.root_dir)
        val_dataset = gtzan.GTZAN(root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_path(self):
        root_dir = Path(self.root_dir)
        test_dataset = gtzan.GTZAN(root_dir, subset='testing')
        self._test_testing(test_dataset)
test/torchaudio_unittest/datasets/librispeech_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets import librispeech

# Used to generate a unique transcript for each dummy audio file
_NUMBERS = ['ZERO', 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIGHT', 'NINE']


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    dataset_dir = os.path.join(root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL)
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0

    for speaker_id in range(5):
        speaker_path = os.path.join(dataset_dir, str(speaker_id))
        os.makedirs(speaker_path, exist_ok=True)

        for chapter_id in range(3):
            chapter_path = os.path.join(speaker_path, str(chapter_id))
            os.makedirs(chapter_path, exist_ok=True)
            trans_content = []

            for utterance_id in range(10):
                filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
                path = os.path.join(chapter_path, filename)

                transcript = ' '.join(
                    [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
                )
                trans_content.append(f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}')

                data = get_whitenoise(
                    sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='float32', seed=seed)
                save_wav(path, data, sample_rate)
                sample = (normalize_wav(data), sample_rate, transcript, speaker_id, chapter_id, utterance_id)
                mocked_data.append(sample)

                seed += 1

            trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
            trans_path = os.path.join(chapter_path, trans_filename)
            with open(trans_path, 'w') as f:
                f.write('\n'.join(trans_content))
    return mocked_data


class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @classmethod
    def tearDownClass(cls):
        # In case of test failure
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def _test_librispeech(self, dataset):
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert chapter_id == self.samples[i][4]
            assert utterance_id == self.samples[i][5]
            num_samples += 1

        assert num_samples == len(self.samples)
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def test_librispeech_str(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(self.root_dir)
        self._test_librispeech(dataset)

    def test_librispeech_path(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
        self._test_librispeech(dataset)
test/torchaudio_unittest/datasets/libritts_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets.libritts import LIBRITTS

_UTTERANCE_IDS = [
    [19, 198, '000000', '000000'],
    [26, 495, '000004', '000000'],
]
_ORIGINAL_TEXT = 'this is the original text.'
_NORMALIZED_TEXT = 'this is the normalized text.'


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
    for i, utterance_id in enumerate(_UTTERANCE_IDS):
        filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
        file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
        os.makedirs(file_dir, exist_ok=True)
        path = os.path.join(file_dir, filename)

        data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
        save_wav(path, data, 24000)
        mocked_data.append(normalize_wav(data))

        original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
        path_original = os.path.join(file_dir, original_text_filename)
        with open(path_original, 'w') as file_:
            file_.write(_ORIGINAL_TEXT)

        normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
        path_normalized = os.path.join(file_dir, normalized_text_filename)
        with open(path_normalized, 'w') as file_:
            file_.write(_NORMALIZED_TEXT)
    return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT


class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    data = []
    _utterance_ids, _original_text, _normalized_text = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data, cls._utterance_ids, cls._original_text, cls._normalized_text = \
            get_mock_dataset(cls.root_dir)

    def _test_libritts(self, dataset):
        n_ites = 0
        for i, (waveform,
                sample_rate,
                original_text,
                normalized_text,
                speaker_id,
                chapter_id,
                utterance_id) in enumerate(dataset):
            expected_ids = self._utterance_ids[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 24000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self._original_text
            assert normalized_text == self._normalized_text
            assert utterance_id == f'{"_".join(str(u) for u in expected_ids[-4:])}'
            n_ites += 1
        assert n_ites == len(self._utterance_ids)

    def test_libritts_str(self):
        dataset = LIBRITTS(self.root_dir)
        self._test_libritts(dataset)

    def test_libritts_path(self):
        dataset = LIBRITTS(Path(self.root_dir))
        self._test_libritts(dataset)
test/torchaudio_unittest/datasets/ljspeech_test.py  0 → 100644
import csv
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    normalize_wav,
    save_wav,
)
from torchaudio.datasets import ljspeech

_TRANSCRIPTS = [
    "Test transcript 1",
    "Test transcript 2",
    "Test transcript 3",
    "In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]

_NORMALIZED_TRANSCRIPT = [
    "Test transcript one",
    "Test transcript two",
    "Test transcript three",
    "In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]


def get_mock_dataset(root_dir):
    """
    root_dir: path to the mocked dataset
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, "LJSpeech-1.1")
    archive_dir = os.path.join(base_dir, "wavs")
    os.makedirs(archive_dir, exist_ok=True)
    metadata_path = os.path.join(base_dir, "metadata.csv")
    sample_rate = 22050

    with open(metadata_path, mode="w", newline='') as metadata_file:
        metadata_writer = csv.writer(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE)
        for i, (transcript, normalized_transcript) in enumerate(
            zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
        ):
            fileid = f'LJ001-{i:04d}'
            metadata_writer.writerow([fileid, transcript, normalized_transcript])
            filename = fileid + ".wav"
            path = os.path.join(archive_dir, filename)
            data = get_whitenoise(
                sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i)
            save_wav(path, data, sample_rate)
            mocked_data.append(normalize_wav(data))
    return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT


class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    data, _transcripts, _normalized_transcript = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data, cls._transcripts, cls._normalized_transcript = get_mock_dataset(cls.root_dir)

    def _test_ljspeech(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(dataset):
            expected_transcript = self._transcripts[i]
            expected_normalized_transcript = self._normalized_transcript[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == sample_rate
            assert transcript == expected_transcript
            assert normalized_transcript == expected_normalized_transcript
            n_ite += 1
        assert n_ite == len(self.data)

    def test_ljspeech_str(self):
        dataset = ljspeech.LJSPEECH(self.root_dir)
        self._test_ljspeech(dataset)

    def test_ljspeech_path(self):
        dataset = ljspeech.LJSPEECH(Path(self.root_dir))
        self._test_ljspeech(dataset)
test/torchaudio_unittest/datasets/speechcommands_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    normalize_wav,
    save_wav,
)
from torchaudio.datasets import speechcommands

_LABELS = [
    "bed", "bird", "cat", "dog", "down", "eight", "five", "follow",
    "forward", "four", "go", "happy", "house", "learn", "left", "marvin",
    "nine", "no", "off", "on", "one", "right", "seven", "sheila", "six",
    "stop", "three", "tree", "two", "up", "visual", "wow", "yes", "zero",
]


def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory to the mocked dataset
    """
    mocked_samples = []
    mocked_train_samples = []
    mocked_valid_samples = []
    mocked_test_samples = []
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz sample rate
    seed = 0
    valid_file = os.path.join(dataset_dir, "validation_list.txt")
    test_file = os.path.join(dataset_dir, "testing_list.txt")
    with open(valid_file, "w") as valid, open(test_file, "w") as test:
        for label in _LABELS:
            path = os.path.join(dataset_dir, label)
            os.makedirs(path, exist_ok=True)
            for j in range(6):
                # generate hash ID for speaker
                speaker = "{:08x}".format(j)

                for utterance in range(3):
                    filename = f"{speaker}{speechcommands.HASH_DIVIDER}{utterance}.wav"
                    file_path = os.path.join(path, filename)
                    seed += 1
                    data = get_whitenoise(
                        sample_rate=sample_rate,
                        duration=0.01,
                        n_channels=1,
                        dtype="int16",
                        seed=seed,
                    )
                    save_wav(file_path, data, sample_rate)
                    sample = (
                        normalize_wav(data),
                        sample_rate,
                        label,
                        speaker,
                        utterance,
                    )
                    mocked_samples.append(sample)
                    if j < 2:
                        mocked_train_samples.append(sample)
                    elif j < 4:
                        valid.write(f'{label}/{filename}\n')
                        mocked_valid_samples.append(sample)
                    elif j < 6:
                        test.write(f'{label}/{filename}\n')
                        mocked_test_samples.append(sample)
    return mocked_samples, mocked_train_samples, mocked_valid_samples, mocked_test_samples


class TestSpeechCommands(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    samples = []
    train_samples = []
    valid_samples = []
    test_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(
            cls.root_dir, speechcommands.FOLDER_IN_ARCHIVE, speechcommands.URL
        )
        cls.samples, cls.train_samples, cls.valid_samples, cls.test_samples = \
            get_mock_dataset(dataset_dir)

    def _testSpeechCommands(self, dataset, data_samples):
        num_samples = 0
        for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(dataset):
            self.assertEqual(data, data_samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == data_samples[i][1]
            assert label == data_samples[i][2]
            assert speaker_id == data_samples[i][3]
            assert utterance_number == data_samples[i][4]
            num_samples += 1

        assert num_samples == len(data_samples)

    def testSpeechCommands_str(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir)
        self._testSpeechCommands(dataset, self.samples)

    def testSpeechCommands_path(self):
        dataset = speechcommands.SPEECHCOMMANDS(Path(self.root_dir))
        self._testSpeechCommands(dataset, self.samples)

    def testSpeechCommandsSubsetTrain(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        self._testSpeechCommands(dataset, self.train_samples)

    def testSpeechCommandsSubsetValid(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        self._testSpeechCommands(dataset, self.valid_samples)

    def testSpeechCommandsSubsetTest(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
        self._testSpeechCommands(dataset, self.test_samples)

    def testSpeechCommandsSum(self):
        dataset_all = speechcommands.SPEECHCOMMANDS(self.root_dir)
        dataset_train = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        dataset_valid = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        dataset_test = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")

        assert len(dataset_train) + len(dataset_valid) + len(dataset_test) == len(dataset_all)
test/torchaudio_unittest/datasets/tedlium_test.py  0 → 100644
import os
import platform
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    skipIfNoSox,
)
from torchaudio.datasets import tedlium

# Used to generate a unique utterance for each dummy audio file
_UTTERANCES = [
    "AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 <o,f0,female> script1\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 <o,f0,female> script2\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 <o,f0,female> script3\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 <o,f0,female> script4\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 <o,f0,female> script5\n",
]

_PHONEME = [
    "a AH",
    "a(2) EY",
    "aachen AA K AH N",
    "aad AE D",
    "aaden EY D AH N",
    "aadmi AE D M IY",
    "aae EY EY",
]


def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory of the mocked dataset
    """
    mocked_samples = {}
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0

    for release in ["release1", "release2", "release3"]:
        data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)
        if release in ["release1", "release2"]:
            release_dir = os.path.join(
                dataset_dir,
                tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                tedlium._RELEASE_CONFIGS[release]["subset"],
            )
        else:
            release_dir = os.path.join(
                dataset_dir,
                tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                tedlium._RELEASE_CONFIGS[release]["data_path"],
            )
        os.makedirs(release_dir, exist_ok=True)
        os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True)  # Subfolder for transcripts
        os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True)  # Subfolder for audio files

        filename = f"{release}.sph"
        path = os.path.join(os.path.join(release_dir, "sph"), filename)
        save_wav(path, data, sample_rate)

        trans_filename = f"{release}.stm"
        trans_path = os.path.join(os.path.join(release_dir, "stm"), trans_filename)
        with open(trans_path, "w") as f:
            f.write("".join(_UTTERANCES))

        dict_filename = f"{release}.dic"
        dict_path = os.path.join(release_dir, dict_filename)
        with open(dict_path, "w") as f:
            f.write("\n".join(_PHONEME))

        # Create a samples list to compare with
        mocked_samples[release] = []
        for utterance in _UTTERANCES:
            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
            start_time = int(float(start_time)) * sample_rate
            end_time = int(float(end_time)) * sample_rate
            sample = (
                data[:, start_time:end_time],
                sample_rate,
                transcript,
                talk_id,
                speaker_id,
                identifier,
            )
            mocked_samples[release].append(sample)
        seed += 1
    return mocked_samples


class Tedlium(TempDirMixin):
    root_dir = None
    samples = {}

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium")
        cls.samples = get_mock_dataset(dataset_dir)

    def _test_tedlium(self, dataset, release):
        num_samples = 0
        for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
            self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[release][i][1]
            assert transcript == self.samples[release][i][2]
            assert talk_id == self.samples[release][i][3]
            assert speaker_id == self.samples[release][i][4]
            assert identifier == self.samples[release][i][5]
            num_samples += 1
        assert num_samples == len(self.samples[release])

        dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
        phoneme_dict = dataset.phoneme_dict
        phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert phonemes == _PHONEME
    def test_tedlium_release1_str(self):
        release = "release1"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release1_path(self):
        release = "release1"
        dataset = tedlium.TEDLIUM(Path(self.root_dir), release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release2(self):
        release = "release2"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release3(self):
        release = "release3"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)


class TestTedliumSoundfile(Tedlium, TorchaudioTestCase):
    backend = "soundfile"


if platform.system() != "Windows":
    @skipIfNoSox
    class TestTedliumSoxIO(Tedlium, TorchaudioTestCase):
        backend = "sox_io"
test/torchaudio_unittest/datasets/utils_test.py
0 → 100644
import torch

from torchaudio_unittest.common_utils import (
    TorchaudioTestCase,
    TempDirMixin,
)
from torchaudio.datasets import utils as dataset_utils


class Dataset(torch.utils.data.Dataset):
    def __getitem__(self, n):
        sample_rate = 8000
        waveform = n * torch.ones(2, 256)
        return waveform, sample_rate

    def __len__(self) -> int:
        return 2

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]


class TestIterator(TorchaudioTestCase, TempDirMixin):
    backend = 'default'
    def test_diskcache_iterator(self):
        data = dataset_utils.diskcache_iterator(Dataset(), self.get_base_temp_dir())
        # Save
        data[0]
        # Load
        data[0]
    def test_bg_iterator(self):
        data = dataset_utils.bg_iterator(Dataset(), 5)
        for _ in data:
            pass
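Both helpers wrap an arbitrary dataset: diskcache_iterator caches each fetched item on disk, so the second data[0] above is served from the cache rather than recomputed, while bg_iterator prefetches items into a bounded queue on a background thread. A minimal usage sketch, assuming only the two torchaudio.datasets.utils functions called above (both helpers were deprecated in later torchaudio releases):

from torchaudio.datasets import utils as dataset_utils

dataset = Dataset()  # the toy dataset defined above
# Prefetch up to 4 items on a background thread while the loop body runs.
for waveform, sample_rate in dataset_utils.bg_iterator(dataset, 4):
    pass  # consume each (waveform, sample_rate) pair here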
test/torchaudio_unittest/datasets/vctk_test.py
0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import vctk
from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

# Used to generate a unique transcript for each dummy audio file
_TRANSCRIPT = [
    'Please call Stella',
    'Ask her to bring these things',
    'with her from the store',
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
    'We also need a small plastic snake and a big toy frog for the kids',
    'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
    'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
    'The rainbow is a division of white light into many beautiful colors',
    'These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon',
    'There is, according to legend, a boiling pot of gold at one end'
]


def get_mock_dataset(root_dir):
    """
    root_dir: root directory of the mocked data
    """
    mocked_samples = []
    dataset_dir = os.path.join(root_dir, 'VCTK-Corpus-0.92')
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 48000
    seed = 0

    for speaker in range(225, 230):
        speaker_id = 'p' + str(speaker)
        audio_dir = os.path.join(dataset_dir, 'wav48_silence_trimmed', speaker_id)
        os.makedirs(audio_dir, exist_ok=True)
        file_dir = os.path.join(dataset_dir, 'txt', speaker_id)
        os.makedirs(file_dir, exist_ok=True)

        for utterance_id in range(1, 11):
            filename = f'{speaker_id}_{utterance_id:03d}_mic2'
            audio_file_path = os.path.join(audio_dir, filename + '.wav')
            data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='float32', seed=seed)
            save_wav(audio_file_path, data, sample_rate)

            txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
            transcript = _TRANSCRIPT[utterance_id - 1]
            with open(txt_file_path, 'w') as f:
                f.write(transcript)

            sample = (normalize_wav(data), sample_rate, transcript, speaker_id, utterance_id)
            mocked_samples.append(sample)
            seed += 1
    return mocked_samples


class TestVCTK(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_vctk(self, dataset):
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert int(utterance_id) == self.samples[i][4]
            num_samples += 1
        assert num_samples == len(self.samples)

    def test_vctk_str(self):
        dataset = vctk.VCTK_092(self.root_dir, audio_ext=".wav")
        self._test_vctk(dataset)

    def test_vctk_path(self):
        dataset = vctk.VCTK_092(Path(self.root_dir), audio_ext=".wav")
        self._test_vctk(dataset)
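The filename[:-5] slice above depends on the VCTK 0.92 layout: audio lives at wav48_silence_trimmed/<speaker>/<speaker>_<utt>_mic2.wav while the matching transcript is txt/<speaker>/<speaker>_<utt>.txt, so dropping the 5-character '_mic2' suffix maps one onto the other. A small sketch of that mapping, assuming only the naming convention used in get_mock_dataset:

def transcript_name(audio_filename):
    # 'p225_001_mic2.wav' -> stem 'p225_001_mic2' -> drop '_mic2' -> 'p225_001.txt'
    stem = audio_filename.rsplit('.', 1)[0]
    return stem[:-5] + '.txt'

assert transcript_name('p225_001_mic2.wav') == 'p225_001.txt'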
test/torchaudio_unittest/datasets/yesno_test.py
0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import yesno
from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_data(root_dir, labels):
    """
    root_dir: path
    labels: list of labels
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, 'waves_yesno')
    os.makedirs(base_dir, exist_ok=True)
    for i, label in enumerate(labels):
        filename = f'{"_".join(str(l) for l in label)}.wav'
        path = os.path.join(base_dir, filename)
        data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
        save_wav(path, data, 8000)
        mocked_data.append(normalize_wav(data))
    return mocked_data


class TestYesNo(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    data = []
    labels = [
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1],
    ]

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_data(cls.root_dir, cls.labels)

    def _test_yesno(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            expected_label = self.labels[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert label == expected_label
            n_ite += 1
        assert n_ite == len(self.data)

    def test_yesno_str(self):
        dataset = yesno.YESNO(self.root_dir)
        self._test_yesno(dataset)

    def test_yesno_path(self):
        dataset = yesno.YESNO(Path(self.root_dir))
        self._test_yesno(dataset)
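YESNO encodes each recording's label directly in its filename: eight 0/1 flags joined by underscores, exactly what get_mock_data writes. A hedged sketch of recovering the label list from such a name, assuming only that convention:

def label_from_filename(filename):
    # '0_1_0_1_0_1_1_0.wav' -> [0, 1, 0, 1, 0, 1, 1, 0]
    return [int(flag) for flag in filename.rsplit('.', 1)[0].split('_')]

assert label_from_filename('0_1_0_1_0_1_1_0.wav') == [0, 1, 0, 1, 0, 1, 1, 0]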
test/torchaudio_unittest/example/__init__.py
0 → 100644
import os
import sys

# Make the repository's top-level `examples` directory importable from the test suite.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples'))
test/torchaudio_unittest/example/souce_sepration/__init__.py
0 → 100644