Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
3bd4db86
Commit
3bd4db86
authored
Jan 03, 2019
by
David Pollack
Committed by
Soumith Chintala
Jan 04, 2019
Browse files
refactoring and cleaning up code
parent
0e0d1e59
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
213 additions
and
93 deletions
+213
-93
.clang-format
.clang-format
+88
-0
.clang-tidy
.clang-tidy
+33
-0
.flake8
.flake8
+4
-0
setup.py
setup.py
+21
-1
test/test.py
test/test.py
+3
-4
test/test_dataloader.py
test/test_dataloader.py
+2
-1
test/test_legacy.py
test/test_legacy.py
+2
-2
test/test_sox_effects.py
test/test_sox_effects.py
+5
-4
test/test_transforms.py
test/test_transforms.py
+4
-2
torchaudio/__init__.py
torchaudio/__init__.py
+3
-3
torchaudio/sox_effects.py
torchaudio/sox_effects.py
+2
-1
torchaudio/torch_sox.cpp
torchaudio/torch_sox.cpp
+4
-11
torchaudio/torch_sox.h
torchaudio/torch_sox.h
+9
-2
torchaudio/transforms.py
torchaudio/transforms.py
+33
-62
No files found.
.clang-format
0 → 100644
View file @
3bd4db86
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 2000000
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...
.clang-tidy
0 → 100644
View file @
3bd4db86
---
# NOTE there must be no spaces before the '-', so put the comma first.
Checks: '
-*
,bugprone-*
,-bugprone-forward-declaration-namespace
,-bugprone-macro-parentheses
,cppcoreguidelines-*
,-cppcoreguidelines-interfaces-global-init
,-cppcoreguidelines-owning-memory
,-cppcoreguidelines-pro-bounds-array-to-pointer-decay
,-cppcoreguidelines-pro-bounds-constant-array-index
,-cppcoreguidelines-pro-bounds-pointer-arithmetic
,-cppcoreguidelines-pro-type-cstyle-cast
,-cppcoreguidelines-pro-type-reinterpret-cast
,-cppcoreguidelines-pro-type-static-cast-downcast
,-cppcoreguidelines-pro-type-union-access
,-cppcoreguidelines-pro-type-vararg
,-cppcoreguidelines-special-member-functions
,hicpp-exception-baseclass
,hicpp-avoid-goto
,modernize-*
,-modernize-return-braced-init-list
,-modernize-use-auto
,-modernize-use-default-member-init
,-modernize-use-using
,performance-unnecessary-value-param
'
WarningsAsErrors: '*'
HeaderFilterRegex: 'torchaudio/.*'
AnalyzeTemporaryDtors: false
CheckOptions:
...
.flake8
0 → 100644
View file @
3bd4db86
[flake8]
max-line-length = 120
ignore = E305,E402,E721,E741,F401,F403,F405,F821,F841,F999,W503,W504
exclude = build,docs/source,_ext
setup.py
View file @
3bd4db86
#!/usr/bin/env python
import
os
import
platform
from
setuptools
import
setup
,
find_packages
from
torch.utils.cpp_extension
import
BuildExtension
,
CppExtension
def check_env_flag(name, default=''):
    """Return True when environment variable ``name`` holds a truthy value.

    Args:
        name: Name of the environment variable to read.
        default: Value used when the variable is unset. The default, an
            empty string, is treated as false.

    Returns:
        bool: True if the value, ignoring case and surrounding whitespace,
        is one of ``ON``, ``1``, ``YES``, ``TRUE`` or ``Y``; False otherwise.
    """
    # Strip before comparing: a shell line such as `set DEBUG=1 && ...` in
    # cmd.exe stores "1 " (trailing space), which would otherwise read as false.
    value = os.getenv(name, default).strip().upper()
    return value in {'ON', '1', 'YES', 'TRUE', 'Y'}
# Opt-in debug build, controlled by the DEBUG environment variable.
DEBUG = check_env_flag('DEBUG')
# eca / ela are passed to the extension below as extra_compile_args and
# extra_link_args respectively; both stay empty for a normal build.
eca, ela = [], []
if DEBUG and platform.system() == 'Windows':
    # On Windows only a linker flag is added.
    ela.append('/DEBUG:FULL')
elif DEBUG:
    # Everywhere else the same flags go to both the compiler and the linker.
    eca.extend(['-O0', '-g'])
    ela.extend(['-O0', '-g'])
setup
(
name
=
"torchaudio"
,
version
=
"0.2"
,
...
...
@@ -14,6 +30,10 @@ setup(
packages
=
find_packages
(
exclude
=
[
"build"
]),
ext_modules
=
[
CppExtension
(
'_torch_sox'
,
[
'torchaudio/torch_sox.cpp'
],
libraries
=
[
'sox'
]),
'_torch_sox'
,
[
'torchaudio/torch_sox.cpp'
],
libraries
=
[
'sox'
],
extra_compile_args
=
eca
,
extra_link_args
=
ela
),
],
cmdclass
=
{
'build_ext'
:
BuildExtension
})
test/test.py
View file @
3bd4db86
...
...
@@ -27,7 +27,6 @@ class Test_LoadSave(unittest.TestCase):
os
.
unlink
(
new_filepath
)
# test save 1d tensor
#x = x[:, 0] # get mono signal
x
=
x
[
0
,
:]
# get mono signal
x
.
squeeze_
()
# remove channel dim
torchaudio
.
save
(
new_filepath
,
x
,
sr
)
...
...
@@ -91,7 +90,7 @@ class Test_LoadSave(unittest.TestCase):
offset
=
15
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
)
x_offset
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
offset
=
offset
)
self
.
assertTrue
(
x
[:,
offset
:].
allclose
(
x_offset
))
self
.
assertTrue
(
x
[:,
offset
:].
allclose
(
x_offset
))
# check number of frames
n
=
201
...
...
@@ -132,7 +131,7 @@ class Test_LoadSave(unittest.TestCase):
input_sine_path
=
os
.
path
.
join
(
self
.
test_dirpath
,
'assets'
,
'sinewave.wav'
)
x_sine_full
,
sr_sine
=
torchaudio
.
load
(
input_sine_path
)
x_sine_part
,
_
=
torchaudio
.
load
(
input_sine_path
,
num_frames
=
num_frames
,
offset
=
offset
)
l1_error
=
x_sine_full
[:,
offset
:(
num_frames
+
offset
)].
sub
(
x_sine_part
).
abs
().
sum
().
item
()
l1_error
=
x_sine_full
[:,
offset
:(
num_frames
+
offset
)].
sub
(
x_sine_part
).
abs
().
sum
().
item
()
# test for the correct number of samples and that the correct portion was loaded
self
.
assertEqual
(
x_sine_part
.
size
(
1
),
num_frames
)
self
.
assertEqual
(
l1_error
,
0.
)
...
...
@@ -148,7 +147,7 @@ class Test_LoadSave(unittest.TestCase):
# test with two channel mp3
x_2ch_full
,
sr_2ch
=
torchaudio
.
load
(
self
.
test_filepath
,
normalization
=
True
)
x_2ch_part
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
normalization
=
True
,
num_frames
=
num_frames
,
offset
=
offset
)
l1_error
=
x_2ch_full
[:,
offset
:(
offset
+
num_frames
)].
sub
(
x_2ch_part
).
abs
().
sum
().
item
()
l1_error
=
x_2ch_full
[:,
offset
:(
offset
+
num_frames
)].
sub
(
x_2ch_part
).
abs
().
sum
().
item
()
self
.
assertEqual
(
x_2ch_part
.
size
(
1
),
num_frames
)
self
.
assertEqual
(
l1_error
,
0.
)
...
...
test/test_dataloader.py
View file @
3bd4db86
...
...
@@ -30,13 +30,14 @@ class TORCHAUDIODS(Dataset):
def
__len__
(
self
):
return
len
(
self
.
data
)
class
Test_DataLoader
(
unittest
.
TestCase
):
def
test_1
(
self
):
expected_size
=
(
2
,
1
,
16000
)
ds
=
TORCHAUDIODS
()
dl
=
DataLoader
(
ds
,
batch_size
=
2
)
for
x
in
dl
:
#print(x.size())
#
print(x.size())
continue
self
.
assertTrue
(
x
.
size
()
==
expected_size
)
...
...
test/test_legacy.py
View file @
3bd4db86
...
...
@@ -120,7 +120,7 @@ class Test_LoadSave(unittest.TestCase):
input_sine_path
=
os
.
path
.
join
(
self
.
test_dirpath
,
'assets'
,
'sinewave.wav'
)
x_sine_full
,
sr_sine
=
load
(
input_sine_path
)
x_sine_part
,
_
=
load
(
input_sine_path
,
num_frames
=
num_frames
,
offset
=
offset
)
l1_error
=
x_sine_full
[
offset
:(
num_frames
+
offset
)].
sub
(
x_sine_part
).
abs
().
sum
().
item
()
l1_error
=
x_sine_full
[
offset
:(
num_frames
+
offset
)].
sub
(
x_sine_part
).
abs
().
sum
().
item
()
# test for the correct number of samples and that the correct portion was loaded
self
.
assertEqual
(
x_sine_part
.
size
(
0
),
num_frames
)
self
.
assertEqual
(
l1_error
,
0.
)
...
...
@@ -137,7 +137,7 @@ class Test_LoadSave(unittest.TestCase):
# test with two channel mp3
x_2ch_full
,
sr_2ch
=
load
(
self
.
test_filepath
,
normalization
=
True
)
x_2ch_part
,
_
=
load
(
self
.
test_filepath
,
normalization
=
True
,
num_frames
=
num_frames
,
offset
=
offset
)
l1_error
=
x_2ch_full
[
offset
:(
offset
+
num_frames
)].
sub
(
x_2ch_part
).
abs
().
sum
().
item
()
l1_error
=
x_2ch_full
[
offset
:(
offset
+
num_frames
)].
sub
(
x_2ch_part
).
abs
().
sum
().
item
()
self
.
assertEqual
(
x_2ch_part
.
size
(
0
),
num_frames
)
self
.
assertEqual
(
l1_error
,
0.
)
...
...
test/test_sox_effects.py
View file @
3bd4db86
...
...
@@ -17,7 +17,7 @@ class Test_SoxEffectsChain(unittest.TestCase):
E
.
append_effect_to_chain
(
"echos"
,
[
0.8
,
0.7
,
40
,
0.25
,
63
,
0.3
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effects worked
#print(x.size())
#
print(x.size())
def
test_rate_channels
(
self
):
target_rate
=
16000
...
...
@@ -154,7 +154,7 @@ class Test_SoxEffectsChain(unittest.TestCase):
E
.
append_effect_to_chain
(
"trim"
,
[
offset
,
num_frames
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effect worked
self
.
assertTrue
(
x
.
allclose
(
x_orig
[:,
offset_int
:(
offset_int
+
num_frames_int
)],
rtol
=
1e-4
,
atol
=
1e-4
))
self
.
assertTrue
(
x
.
allclose
(
x_orig
[:,
offset_int
:(
offset_int
+
num_frames_int
)],
rtol
=
1e-4
,
atol
=
1e-4
))
def
test_silence_contrast
(
self
):
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
...
...
@@ -183,13 +183,14 @@ class Test_SoxEffectsChain(unittest.TestCase):
E
.
append_effect_to_chain
(
"fade"
,
[
"q"
,
"0.25"
,
"0"
,
"0.33"
])
x
,
_
=
E
.
sox_build_flow_effects
()
# check if effect worked
#print(x.size())
#
print(x.size())
def
test_biquad_delay
(
self
):
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"biquad"
,
[
"0.25136437"
,
"0.50272873"
,
"0.25136437"
,
"1.0"
,
"-0.17123075"
,
"0.17668821"
])
E
.
append_effect_to_chain
(
"biquad"
,
[
"0.25136437"
,
"0.50272873"
,
"0.25136437"
,
"1.0"
,
"-0.17123075"
,
"0.17668821"
])
E
.
append_effect_to_chain
(
"delay"
,
[
"15000s"
])
x
,
_
=
E
.
sox_build_flow_effects
()
# check if effect worked
...
...
test/test_transforms.py
View file @
3bd4db86
...
...
@@ -38,9 +38,11 @@ class Tester(unittest.TestCase):
length_new
=
int
(
length_orig
*
1.2
)
result
=
transforms
.
PadTrim
(
max_len
=
length_new
,
channels_first
=
False
)(
audio_orig
)
self
.
assertEqual
(
result
.
size
(
0
),
length_new
)
result
=
transforms
.
PadTrim
(
max_len
=
length_new
,
channels_first
=
True
)(
audio_orig
.
transpose
(
0
,
1
))
self
.
assertEqual
(
result
.
size
(
1
),
length_new
)
audio_orig
=
self
.
sig
.
clone
()
length_orig
=
audio_orig
.
size
(
0
)
length_new
=
int
(
length_orig
*
0.8
)
...
...
@@ -147,7 +149,7 @@ class Tester(unittest.TestCase):
audio_orig
=
self
.
sig
.
clone
()
# (16000, 1)
audio_scaled
=
transforms
.
Scale
()(
audio_orig
)
# (16000, 1)
audio_scaled
=
transforms
.
LC2CL
()(
audio_scaled
)
# (1, 16000)
spectrogram_torch
=
transforms
.
MEL2
()(
audio_scaled
)
# (1, 319, 40)
spectrogram_torch
=
transforms
.
MEL2
(
window_fn
=
torch
.
hamming_window
,
pad
=
10
)(
audio_scaled
)
# (1, 319, 40)
self
.
assertTrue
(
spectrogram_torch
.
dim
()
==
3
)
self
.
assertTrue
(
spectrogram_torch
.
max
()
<=
0.
)
...
...
torchaudio/__init__.py
View file @
3bd4db86
...
...
@@ -44,7 +44,8 @@ def load(filepath,
filetype (str, optional): a filetype or extension to be set if sox cannot determine it automatically
Returns: tuple(Tensor, int)
- Tensor: output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames, C is the number of channels
- Tensor: output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames and
C is the number of channels
- int: the sample rate of the audio (as listed in the metadata of the file)
Example::
...
...
@@ -127,8 +128,7 @@ def save_encinfo(filepath,
>>> torchaudio.save('foo.wav', data, sample_rate)
"""
ch_idx
=
0
if
channels_first
else
1
len_idx
=
1
if
channels_first
else
0
ch_idx
,
len_idx
=
(
0
,
1
)
if
channels_first
else
(
1
,
0
)
# check if save directory exists
abs_dirpath
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
filepath
))
...
...
torchaudio/sox_effects.py
View file @
3bd4db86
...
...
@@ -44,7 +44,8 @@ class SoxEffectsChain(object):
filetype (str, optional): a filetype or extension to be set if sox cannot determine it automatically
Returns: tuple(Tensor, int)
- Tensor: output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames, C is the number of channels
- Tensor: output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames and
C is the number of channels
- int: the sample rate of the audio (as listed in the metadata of the file)
Example::
...
...
torchaudio/torch_sox.cpp
View file @
3bd4db86
...
...
@@ -158,7 +158,7 @@ int read_audio_file(
void
write_audio_file
(
const
std
::
string
&
file_name
,
at
::
Tensor
tensor
,
const
at
::
Tensor
&
tensor
,
sox_signalinfo_t
*
si
,
sox_encodinginfo_t
*
ei
,
const
char
*
file_type
)
{
...
...
@@ -332,16 +332,9 @@ int build_flow_effects(const std::string& file_name,
int
sr
;
// Read the in-memory audio buffer or temp file that we just wrote.
#ifdef __APPLE__
/* certain effects will result in a target signal length of 0.
if (target_signal->length > 0) {
if (target_signal->channels != output->signal.channels) {
std::cout << "output: " << output->signal.channels << "|" << output->signal.length << "\n";
std::cout << "interm: " << interm_signal.channels << "|" << interm_signal.length << "\n";
std::cout << "target: " << target_signal->channels << "|" << target_signal->length << "\n";
unlink(tmp_name);
throw std::runtime_error("unexpected number of audio channels");
}
}
/*
Temporary filetype must have a valid header. Wav seems to work here while
raw does not. Certain effects like chorus caused strange behavior on the mac.
*/
// read_audio_file reads the temporary file and returns the sr and otensor
sr
=
read_audio_file
(
tmp_name
,
otensor
,
ch_first
,
0
,
0
,
...
...
torchaudio/torch_sox.h
View file @
3bd4db86
...
...
@@ -26,10 +26,10 @@ int read_audio_file(
/// writing, or an error occurred during writing of the audio data.
void
write_audio_file
(
const
std
::
string
&
file_name
,
at
::
Tensor
tensor
,
at
::
Tensor
&
tensor
,
sox_signalinfo_t
*
si
,
sox_encodinginfo_t
*
ei
,
const
char
*
extension
)
const
char
*
file_type
)
/// Reads an audio file from the given `path` and returns a tuple of
/// sox_signalinfo_t and sox_encodinginfo_t, which contain information about
...
...
@@ -46,6 +46,13 @@ std::vector<std::string> get_effect_names();
int
initialize_sox
();
int
shutdown_sox
();
// Struct for build_flow_effects function
// NOTE(review): `ename` is presumably the SoX effect name and `eopts` the
// option strings passed to that effect -- confirm against build_flow_effects.
struct SoxEffect {
  // Default state: an empty name and an options vector that holds exactly one
  // empty string (i.e. eopts.size() == 1, not an empty vector).
  SoxEffect() : ename(""), eopts({""}) {}
  std::string ename;
  std::vector<std::string> eopts;
};
/// Build a SoX chain, flow the effects, and capture the results in a tensor.
/// An audio file from the given `path` flows through an effects chain given
/// by a list of effects and effect options to an output buffer which is encoded
...
...
torchaudio/transforms.py
View file @
3bd4db86
from
__future__
import
division
,
print_function
import
torch
from
torch.autograd
import
Variable
import
numpy
as
np
try
:
import
librosa
...
...
@@ -8,18 +7,6 @@ except ImportError:
librosa
=
None
def
_check_is_variable
(
tensor
):
if
isinstance
(
tensor
,
torch
.
Tensor
):
is_variable
=
False
tensor
=
Variable
(
tensor
,
requires_grad
=
False
)
elif
isinstance
(
tensor
,
Variable
):
is_variable
=
True
else
:
raise
TypeError
(
"tensor should be a Variable or Tensor, but is {}"
.
format
(
type
(
tensor
)))
return
tensor
,
is_variable
class
Compose
(
object
):
"""Composes several transforms together.
...
...
@@ -73,8 +60,8 @@ class Scale(object):
Tensor: Scaled by the scale factor. (default between -1.0 and 1.0)
"""
if
isinstance
(
tensor
,
(
torch
.
LongTensor
,
torch
.
IntTensor
)
):
tensor
=
tensor
.
float
(
)
if
not
tensor
.
is_floating_point
(
):
tensor
=
tensor
.
to
(
torch
.
float
32
)
return
tensor
/
self
.
factor
...
...
@@ -101,18 +88,18 @@ class PadTrim(object):
"""
Returns:
Tensor: (c x
L
n or (n x c)
Tensor: (c x n
)
or (n x c)
"""
assert
tensor
.
size
(
self
.
ch_dim
)
<
128
,
\
"Too many channels ({}) detected,
look at
channels_first param."
.
format
(
tensor
.
size
(
self
.
ch_dim
))
"Too many channels ({}) detected,
see
channels_first param."
.
format
(
tensor
.
size
(
self
.
ch_dim
))
if
self
.
max_len
>
tensor
.
size
(
self
.
len_dim
):
padding_size
=
[
self
.
max_len
-
tensor
.
size
(
self
.
len_dim
)
if
i
=
=
self
.
len_dim
else
tensor
.
size
(
self
.
ch_dim
)
for
i
in
range
(
2
)]
pad
=
torch
.
empty
(
padding_size
,
dtype
=
tensor
.
dtype
).
fill_
(
self
.
fill_value
)
tensor
=
torch
.
cat
((
tensor
,
pad
),
dim
=
self
.
len_dim
)
padding
=
[
self
.
max_len
-
tensor
.
size
(
self
.
len_dim
)
if
(
i
%
2
==
1
)
and
(
i
//
2
!
=
self
.
len_dim
)
else
0
for
i
in
range
(
4
)]
with
torch
.
no_grad
():
tensor
=
torch
.
nn
.
functional
.
pad
(
tensor
,
padding
,
"constant"
,
self
.
fill_value
)
elif
self
.
max_len
<
tensor
.
size
(
self
.
len_dim
):
tensor
=
tensor
.
narrow
(
self
.
len_dim
,
0
,
self
.
max_len
)
return
tensor
...
...
@@ -138,8 +125,8 @@ class DownmixMono(object):
self
.
ch_dim
=
int
(
not
channels_first
)
def
__call__
(
self
,
tensor
):
if
isinstance
(
tensor
,
(
torch
.
LongTensor
,
torch
.
IntTensor
)
):
tensor
=
tensor
.
float
(
)
if
not
tensor
.
is_floating_point
(
):
tensor
=
tensor
.
to
(
torch
.
float
32
)
tensor
=
torch
.
mean
(
tensor
,
self
.
ch_dim
,
True
)
return
tensor
...
...
@@ -182,12 +169,8 @@ class SPECTROGRAM(object):
"""
def
__init__
(
self
,
sr
=
16000
,
ws
=
400
,
hop
=
None
,
n_fft
=
None
,
pad
=
0
,
window
=
torch
.
hann_window
,
wkwargs
=
None
):
if
isinstance
(
window
,
Variable
):
self
.
window
=
window
else
:
self
.
window
=
window
(
ws
)
if
wkwargs
is
None
else
window
(
ws
,
**
wkwargs
)
self
.
window
=
Variable
(
self
.
window
,
volatile
=
True
)
pad
=
0
,
window_fn
=
torch
.
hann_window
,
wkwargs
=
None
):
self
.
window
=
window_fn
(
ws
)
if
wkwargs
is
None
else
window_fn
(
ws
,
**
wkwargs
)
self
.
sr
=
sr
self
.
ws
=
ws
self
.
hop
=
hop
if
hop
is
not
None
else
ws
//
2
...
...
@@ -200,33 +183,27 @@ class SPECTROGRAM(object):
def
__call__
(
self
,
sig
):
"""
Args:
sig (Tensor
or Variable
): Tensor of audio of size (c, n)
sig (Tensor): Tensor of audio of size (c, n)
Returns:
spec_f (Tensor
or Variable
): channels x hops x n_fft (c, l, f), where channels
spec_f (Tensor): channels x hops x n_fft (c, l, f), where channels
is unchanged, hops is the number of hops, and n_fft is the
number of fourier bins, which should be the window size divided
by 2 plus 1.
"""
sig
,
is_variable
=
_check_is_variable
(
sig
)
assert
sig
.
dim
()
==
2
if
self
.
pad
>
0
:
c
,
n
=
sig
.
size
()
new_sig
=
sig
.
new_empty
(
c
,
n
+
self
.
pad
*
2
)
new_sig
[:,
:
self
.
pad
].
zero_
()
new_sig
[:,
-
self
.
pad
:].
zero_
()
new_sig
.
narrow
(
1
,
self
.
pad
,
n
).
copy_
(
sig
)
sig
=
new_sig
with
torch
.
no_grad
():
sig
=
torch
.
nn
.
functional
.
pad
(
sig
,
(
self
.
pad
,
self
.
pad
),
"constant"
)
spec_f
=
torch
.
stft
(
sig
,
self
.
n_fft
,
self
.
hop
,
self
.
ws
,
self
.
window
,
center
=
False
,
normalized
=
True
,
onesided
=
True
).
transpose
(
1
,
2
)
spec_f
/=
self
.
window
.
pow
(
2
).
sum
().
sqrt
()
spec_f
=
spec_f
.
pow
(
2
).
sum
(
-
1
)
# get power of "complex" tensor (c, l, n_fft)
return
spec_f
if
is_variable
else
spec_f
.
data
return
spec_f
class
F2M
(
object
):
...
...
@@ -247,7 +224,6 @@ class F2M(object):
def
__call__
(
self
,
spec_f
):
spec_f
,
is_variable
=
_check_is_variable
(
spec_f
)
n_fft
=
spec_f
.
size
(
2
)
m_min
=
0.
if
self
.
f_min
==
0
else
2595
*
np
.
log10
(
1.
+
(
self
.
f_min
/
700
))
...
...
@@ -269,9 +245,8 @@ class F2M(object):
if
f_m
!=
f_m_plus
:
fb
[
f_m
:
f_m_plus
,
m
-
1
]
=
(
f_m_plus
-
torch
.
arange
(
f_m
,
f_m_plus
))
/
(
f_m_plus
-
f_m
)
fb
=
Variable
(
fb
)
spec_m
=
torch
.
matmul
(
spec_f
,
fb
)
# (c, l, n_fft) dot (n_fft, n_mels) -> (c, l, n_mels)
return
spec_m
if
is_variable
else
spec_m
.
data
return
spec_m
class
SPEC2DB
(
object
):
...
...
@@ -290,11 +265,10 @@ class SPEC2DB(object):
def
__call__
(
self
,
spec
):
spec
,
is_variable
=
_check_is_variable
(
spec
)
spec_db
=
self
.
multiplier
*
torch
.
log10
(
spec
/
spec
.
max
())
# power -> dB
if
self
.
top_db
is
not
None
:
spec_db
=
torch
.
max
(
spec_db
,
spec_db
.
new
([
self
.
top_db
]))
return
spec_db
if
is_variable
else
spec_db
.
data
return
spec_db
class
MEL2
(
object
):
...
...
@@ -322,9 +296,8 @@ class MEL2(object):
>>> spec_mel = transforms.MEL2(sr)(sig) # (c, l, m)
"""
def
__init__
(
self
,
sr
=
16000
,
ws
=
400
,
hop
=
None
,
n_fft
=
None
,
pad
=
0
,
n_mels
=
40
,
window
=
torch
.
hann_window
,
wkwargs
=
None
):
self
.
window
=
window
(
ws
)
if
wkwargs
is
None
else
window
(
ws
,
**
wkwargs
)
self
.
window
=
Variable
(
self
.
window
,
requires_grad
=
False
)
pad
=
0
,
n_mels
=
40
,
window_fn
=
torch
.
hann_window
,
wkwargs
=
None
):
self
.
window_fn
=
window_fn
self
.
sr
=
sr
self
.
ws
=
ws
self
.
hop
=
hop
if
hop
is
not
None
else
ws
//
2
...
...
@@ -348,18 +321,16 @@ class MEL2(object):
"""
sig
,
is_variable
=
_check_is_variable
(
sig
)
transforms
=
Compose
([
SPECTROGRAM
(
self
.
sr
,
self
.
ws
,
self
.
hop
,
self
.
n_fft
,
self
.
pad
,
self
.
window
),
self
.
pad
,
self
.
window
_fn
,
self
.
wkwargs
),
F2M
(
self
.
n_mels
,
self
.
sr
,
self
.
f_max
,
self
.
f_min
),
SPEC2DB
(
"power"
,
self
.
top_db
),
])
spec_mel_db
=
transforms
(
sig
)
return
spec_mel_db
if
is_variable
else
spec_mel_db
.
data
return
spec_mel_db
class
MEL
(
object
):
...
...
@@ -454,10 +425,10 @@ class MuLawEncoding(object):
if
isinstance
(
x
,
np
.
ndarray
):
x_mu
=
np
.
sign
(
x
)
*
np
.
log1p
(
mu
*
np
.
abs
(
x
))
/
np
.
log1p
(
mu
)
x_mu
=
((
x_mu
+
1
)
/
2
*
mu
+
0.5
).
astype
(
int
)
elif
isinstance
(
x
,
(
torch
.
Tensor
,
torch
.
LongTensor
)
):
if
isinstance
(
x
,
torch
.
LongTensor
):
x
=
x
.
float
(
)
mu
=
torch
.
FloatT
ensor
(
[
mu
]
)
elif
isinstance
(
x
,
torch
.
Tensor
):
if
not
x
.
is_floating_point
(
):
x
=
x
.
to
(
torch
.
float
)
mu
=
torch
.
t
ensor
(
mu
,
dtype
=
x
.
dtype
)
x_mu
=
torch
.
sign
(
x
)
*
torch
.
log1p
(
mu
*
torch
.
abs
(
x
))
/
torch
.
log1p
(
mu
)
x_mu
=
((
x_mu
+
1
)
/
2
*
mu
+
0.5
).
long
()
...
...
@@ -496,10 +467,10 @@ class MuLawExpanding(object):
if
isinstance
(
x_mu
,
np
.
ndarray
):
x
=
((
x_mu
)
/
mu
)
*
2
-
1.
x
=
np
.
sign
(
x
)
*
(
np
.
exp
(
np
.
abs
(
x
)
*
np
.
log1p
(
mu
))
-
1.
)
/
mu
elif
isinstance
(
x_mu
,
(
torch
.
Tensor
,
torch
.
LongTensor
)
):
if
isinstance
(
x_mu
,
torch
.
LongTensor
):
x_mu
=
x_mu
.
float
(
)
mu
=
torch
.
FloatTensor
([
mu
]
)
elif
isinstance
(
x_mu
,
torch
.
Tensor
):
if
not
x_mu
.
is_floating_point
(
):
x_mu
=
x_mu
.
to
(
torch
.
float
)
mu
=
torch
.
tensor
(
mu
,
dtype
=
x_mu
.
dtype
)
x
=
((
x_mu
)
/
mu
)
*
2
-
1.
x
=
torch
.
sign
(
x
)
*
(
torch
.
exp
(
torch
.
abs
(
x
)
*
torch
.
log1p
(
mu
))
-
1.
)
/
mu
return
x
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment