"src/git@developer.sourcefind.cn:OpenDAS/lmdeploy.git" did not exist on "b8354dae03e3b942ddef98c3447545f78b84f902"
Commit 301e2e98 authored by David Pollack's avatar David Pollack Committed by Soumith Chintala
Browse files

sox effects and documentation

parent db0da559
......@@ -7,8 +7,10 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
:maxdepth: 2
:caption: Package Reference
sox_effects
datasets
transforms
legacy
.. automodule:: torchaudio
:members:
torchaudio.legacy
======================
Legacy loading and save functions.
.. automodule:: torchaudio.legacy
:members:
torchaudio.sox_effects
======================
Create SoX effects chain for preprocessing audio.
.. currentmodule:: torchaudio.sox_effects
.. autoclass:: SoxEffect
:members:
.. autoclass:: SoxEffectsChain
:members: append_effect_to_chain, sox_build_flow_effects, clear_chain, set_input_file
......@@ -5,7 +5,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension
setup(
name="torchaudio",
version="0.1",
version="0.2",
description="An audio package for PyTorch",
url="https://github.com/pytorch/audio",
author="Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough",
......
......@@ -82,11 +82,29 @@ class Test_LoadSave(unittest.TestCase):
self.assertEqual(sr, 44100)
self.assertEqual(x.size(), (2, 278756))
# check normalizing
x, sr = torchaudio.load(self.test_filepath, normalization=True)
self.assertEqual(x.dtype, torch.float32)
self.assertTrue(x.min() >= -1.0)
self.assertTrue(x.max() <= 1.0)
# check no normalizing
x, _ = torchaudio.load(self.test_filepath, normalization=False)
self.assertTrue(x.min() <= -1.0)
self.assertTrue(x.max() >= 1.0)
# check offset
offset = 15
x, _ = torchaudio.load(self.test_filepath)
x_offset, _ = torchaudio.load(self.test_filepath, offset=offset)
self.assertTrue(x[:,offset:].allclose(x_offset))
# check number of frames
n = 201
x, _ = torchaudio.load(self.test_filepath, num_frames=n)
self.assertTrue(x.size(), (2, n))
# check channels first
x, _ = torchaudio.load(self.test_filepath, channels_first=False)
self.assertEqual(x.size(), (278756, 2))
# check different input tensor type
x, _ = torchaudio.load(self.test_filepath, torch.LongTensor(), normalization=False)
self.assertTrue(isinstance(x, torch.LongTensor))
# check raising errors
with self.assertRaises(OSError):
......@@ -108,8 +126,8 @@ class Test_LoadSave(unittest.TestCase):
os.unlink(output_path)
def test_4_load_partial(self):
num_frames = 100
offset = 200
num_frames = 101
offset = 201
# load entire mono sinewave wav file, load a partial copy and then compare
input_sine_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
x_sine_full, sr_sine = torchaudio.load(input_sine_path)
......
......@@ -16,10 +16,10 @@ class TORCHAUDIODS(Dataset):
self.data = [os.path.join(self.asset_dirpath, fn) for fn in os.listdir(self.asset_dirpath)]
self.si, self.ei = torchaudio.info(os.path.join(self.asset_dirpath, "sinewave.wav"))
self.si.precision = 16
self.E = torchaudio.sox_effects.SoxEffects()
self.E.sox_append_effect_to_chain("rate", [self.si.rate]) # resample to 16000hz
self.E.sox_append_effect_to_chain("channels", [self.si.channels]) # mono singal
self.E.sox_append_effect_to_chain("trim", [0, 1]) # first sec of audio
self.E = torchaudio.sox_effects.SoxEffectsChain()
self.E.append_effect_to_chain("rate", [self.si.rate]) # resample to 16000hz
self.E.append_effect_to_chain("channels", [self.si.channels]) # mono singal
self.E.append_effect_to_chain("trim", [0, "16000s"]) # first 16000 samples of audio
def __getitem__(self, index):
fn = self.data[index]
......
......@@ -5,31 +5,40 @@ import math
import os
class Test_SoxEffects(unittest.TestCase):
class Test_SoxEffectsChain(unittest.TestCase):
test_dirpath = os.path.dirname(os.path.realpath(__file__))
test_filepath = os.path.join(test_dirpath, "assets",
"steam-train-whistle-daniel_simon.mp3")
def test_single_channel(self):
    """Smoke test: push the sine-wave asset through the ``echos`` effect."""
    sine_path = os.path.join(self.test_dirpath, "assets", "sinewave.wav")
    echos_options = [0.8, 0.7, 40, 0.25, 63, 0.3]
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(sine_path)
    chain.append_effect_to_chain("echos", echos_options)
    # nothing is asserted on the output; the test passes if the chain runs without raising
    x, sr = chain.sox_build_flow_effects()
def test_rate_channels(self):
target_rate = 16000
target_channels = 1
E = torchaudio.sox_effects.SoxEffects()
E = torchaudio.sox_effects.SoxEffectsChain()
E.set_input_file(self.test_filepath)
E.sox_append_effect_to_chain("rate", [target_rate])
E.sox_append_effect_to_chain("channels", [target_channels])
E.append_effect_to_chain("rate", [target_rate])
E.append_effect_to_chain("channels", [target_channels])
x, sr = E.sox_build_flow_effects()
# check if effects worked
self.assertEqual(sr, target_rate)
self.assertEqual(x.size(0), target_channels)
def test_other(self):
def test_lowpass_speed(self):
speed = .8
si, _ = torchaudio.info(self.test_filepath)
E = torchaudio.sox_effects.SoxEffects()
E = torchaudio.sox_effects.SoxEffectsChain()
E.set_input_file(self.test_filepath)
E.sox_append_effect_to_chain("lowpass", 100)
E.sox_append_effect_to_chain("speed", speed)
E.sox_append_effect_to_chain("rate", si.rate)
E.append_effect_to_chain("lowpass", 100)
E.append_effect_to_chain("speed", speed)
E.append_effect_to_chain("rate", si.rate)
x, sr = E.sox_build_flow_effects()
# check if effects worked
self.assertEqual(x.size(1), int((si.length / si.channels) / speed))
......@@ -43,17 +52,145 @@ class Test_SoxEffects(unittest.TestCase):
ei_out.encoding = torchaudio.get_sox_encoding_t(9)
ei_out.bits_per_sample = 8
si_in, ei_in = torchaudio.info(self.test_filepath)
E = torchaudio.sox_effects.SoxEffects(out_siginfo=si_out, out_encinfo=ei_out)
E = torchaudio.sox_effects.SoxEffectsChain(out_siginfo=si_out, out_encinfo=ei_out)
E.set_input_file(self.test_filepath)
x, sr = E.sox_build_flow_effects()
# Note: the sample rate is reported as "changed", but no downsampling occurred
# also the number of channels has not changed. Run rate and channels effects
# to make those changes
# to make those changes. However, the output was encoded into ulaw because the
# number of unique values in the output is less than 256.
self.assertLess(x.unique().size(0), 2**8)
self.assertEqual(x.size(0), si_in.channels)
self.assertEqual(sr, si_out.rate)
self.assertEqual(x.numel(), si_in.length)
def test_band_chorus(self):
    """Smoke test: chain the ``band`` and ``chorus`` effects on the test file."""
    in_siginfo, in_encinfo = torchaudio.info(self.test_filepath)
    # reuse the input file's own signal/encoding info for the output
    chain = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=in_encinfo,
                                                   out_siginfo=in_siginfo)
    chain.set_input_file(self.test_filepath)
    band_options = ["-n", "10k", "3.5k"]
    chorus_options = [.5, .7, 55, 0.4, .25, 2, '-s']
    chain.append_effect_to_chain("band", band_options)
    chain.append_effect_to_chain("chorus", chorus_options)
    # nothing is asserted on the output; the test passes if the chain runs without raising
    x, sr = chain.sox_build_flow_effects()
def test_synth(self):
    """Smoke test: run ``synth`` followed by ``rate`` and ``channels``."""
    in_siginfo, in_encinfo = torchaudio.info(self.test_filepath)
    # reuse the input file's own signal/encoding info for the output
    chain = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=in_encinfo,
                                                   out_siginfo=in_siginfo)
    chain.set_input_file(self.test_filepath)
    effects = (
        ("synth", ["1", "pinknoise", "mix"]),
        ("rate", [44100]),
        ("channels", [2]),
    )
    for effect_name, effect_options in effects:
        chain.append_effect_to_chain(effect_name, effect_options)
    # nothing is asserted on the output; the test passes if the chain runs without raising
    x, sr = chain.sox_build_flow_effects()
def test_gain(self):
    """Run the SoX ``gain`` effect with four option sets and check the output peak.

    The positive-gain cases are expected to drive the peak to full scale and the
    negative-gain cases to leave it strictly below full scale.

    NOTE(review): the positive-gain checks were written as
    ``assertTrue(value, 1.)``, which passes for any non-zero value because the
    second argument is only the failure message. They now compare against 1.0
    with a small tolerance; this assumes the test asset is loud enough to clip
    at +5 dB / +8 dB -- confirm against the asset.
    """
    E = torchaudio.sox_effects.SoxEffectsChain()

    # plain +5 dB gain
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("gain", ["5"])
    x, sr = E.sox_build_flow_effects()
    E.clear_chain()
    self.assertAlmostEqual(x.abs().max().item(), 1., places=4)

    # "-e" option with -5 dB
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("gain", ["-e", "-5"])
    x, sr = E.sox_build_flow_effects()
    E.clear_chain()
    self.assertLess(x.abs().max().item(), 1.)

    # "-b" option with +8 dB
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("gain", ["-b", "8"])
    x, sr = E.sox_build_flow_effects()
    E.clear_chain()
    self.assertAlmostEqual(x.abs().max().item(), 1., places=4)

    # "-n" option with -10 dB
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("gain", ["-n", "-10"])
    x, sr = E.sox_build_flow_effects()
    E.clear_chain()
    self.assertLess(x.abs().max().item(), 1.)
def test_tempo(self):
    """The ``tempo`` effect should scale the number of output frames by 1/tempo."""
    tempo = .8
    siginfo, _ = torchaudio.info(self.test_filepath)
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    chain.append_effect_to_chain("tempo", ["-s", tempo])
    x, sr = chain.sox_build_flow_effects()
    # si.length counts samples across all channels, so divide to get frames
    input_frames = siginfo.length / siginfo.channels
    expected_frames = int(input_frames / tempo)
    self.assertEqual(x.size(1), expected_frames)
def test_trim(self):
    """The ``trim`` effect should return the same frames as slicing the loaded file."""
    reference, _ = torchaudio.load(self.test_filepath)
    # the trailing "s" marks the values as sample counts; strip it for the slice bounds
    offset = "10000s"
    num_frames = "200s"
    start = int(offset[:-1])
    stop = start + int(num_frames[:-1])
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    chain.append_effect_to_chain("trim", [offset, num_frames])
    x, sr = chain.sox_build_flow_effects()
    expected = reference[:, start:stop]
    self.assertTrue(x.allclose(expected, rtol=1e-4, atol=1e-4))
def test_silence_contrast(self):
    """``silence`` followed by ``contrast`` should yield fewer samples than the input."""
    siginfo, _ = torchaudio.info(self.test_filepath)
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    for effect_name, effect_options in (("silence", [1, 100, 1]), ("contrast", [])):
        chain.append_effect_to_chain(effect_name, effect_options)
    x, sr = chain.sox_build_flow_effects()
    # the output must hold fewer samples than the input file reports
    self.assertLess(x.numel(), siginfo.length)
def test_reverse(self):
    """The ``reverse`` effect should return the input with its frame order flipped."""
    x_orig, _ = torchaudio.load(self.test_filepath)
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    chain.append_effect_to_chain("reverse", "")
    x_rev, _ = chain.sox_build_flow_effects()
    # index the reversed output back-to-front and compare with the original
    n_frames = x_orig.size(1)
    flipped = torch.LongTensor(list(range(n_frames - 1, -1, -1)))
    self.assertTrue(x_orig.allclose(x_rev[:, flipped], rtol=1e-5, atol=2e-5))
def test_compand_fade(self):
    """Smoke test: chain the ``compand`` and ``fade`` effects on the test file."""
    compand_options = ["0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]
    fade_options = ["q", "0.25", "0", "0.33"]
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    chain.append_effect_to_chain("compand", compand_options)
    chain.append_effect_to_chain("fade", fade_options)
    # nothing is asserted on the output; the test passes if the chain runs without raising
    x, _ = chain.sox_build_flow_effects()
def test_biquad_delay(self):
    """``biquad`` followed by a 15000-sample ``delay`` should lengthen the output by 15000 frames."""
    si, _ = torchaudio.info(self.test_filepath)
    E = torchaudio.sox_effects.SoxEffectsChain()
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("biquad", ["0.25136437", "0.50272873", "0.25136437", "1.0", "-0.17123075", "0.17668821"])
    E.append_effect_to_chain("delay", ["15000s"])
    x, _ = E.sox_build_flow_effects()
    # si.length counts samples across all channels, so divide to get frames.
    # assertEqual (rather than assertTrue on ``==``) reports both values on failure.
    self.assertEqual(x.size(1), (si.length / si.channels) + 15000)
def test_invalid_effect_name(self):
    """Appending an effect name that does not exist must raise ``LookupError``."""
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    unknown_effect = "special"  # there is no effect with this name
    with self.assertRaises(LookupError):
        chain.append_effect_to_chain(unknown_effect, [""])
def test_unimplemented_effect(self):
    """Appending an effect torchaudio does not implement must raise ``NotImplementedError``."""
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    # the sox "spectrogram" effect is not implemented in torchaudio
    unsupported_effect = "spectrogram"
    with self.assertRaises(NotImplementedError):
        chain.append_effect_to_chain(unsupported_effect, [""])
def test_invalid_effect_options(self):
    """Malformed ``compand`` options must make building the chain raise ``RuntimeError``."""
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(self.test_filepath)
    # "0.3" and "1" should have been given as the single option "0.3,1"
    malformed_options = ["0.3", "1", "6:-70,-60,-20", "-5", "-90", "0.2"]
    chain.append_effect_to_chain("compand", malformed_options)
    # appending succeeds; the failure is expected only when the chain is built
    with self.assertRaises(RuntimeError):
        chain.sox_build_flow_effects()
if __name__ == '__main__':
torchaudio.initialize_sox()
unittest.main()
......
from __future__ import division, print_function
import os.path
import torch
import _torch_sox
from torchaudio import transforms, datasets, sox_effects
from torchaudio import transforms, datasets, sox_effects, legacy
def check_input(src):
......@@ -17,7 +18,7 @@ def load(filepath,
out=None,
normalization=True,
channels_first=True,
num_frames=-1,
num_frames=0,
offset=0,
signalinfo=None,
encodinginfo=None,
......@@ -27,13 +28,13 @@ def load(filepath,
Args:
filepath (string): path to audio file
out (Tensor, optional): an output Tensor to use instead of creating one
normalization (bool, number, or function, optional): If boolean `True`, then output is divided by `1 << 31`
(assumes 16-bit depth audio, and normalizes to `[0, 1]`.
normalization (bool, number, or callable, optional): If boolean `True`, then output is divided by `1 << 31`
(assumes signed 32-bit audio), and normalizes to `[-1, 1]`.
If `number`, then output is divided by that number
If `function`, then the output is passed as a parameter
If `callable`, then the output is passed as a parameter
to the given function, then the output is divided by
the result.
num_frames (int, optional): number of frames to load. -1 to load everything after the offset.
num_frames (int, optional): number of frames to load. 0 to load everything after the offset.
offset (int, optional): number of frames from the start of the file to begin data loading.
signalinfo (sox_signalinfo_t, optional): a sox_signalinfo_t type, which could be helpful if the
audio type cannot be automatically determined
......@@ -42,18 +43,18 @@ def load(filepath,
filetype (str, optional): a filetype or extension to be set if sox cannot determine it automatically
Returns: tuple(Tensor, int)
- Tensor: output Tensor of size `[L x C]` where L is the number of audio frames, C is the number of channels
- Tensor: output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames, C is the number of channels
- int: the sample-rate of the audio (as listed in the metadata of the file)
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> print(data.size())
torch.Size([278756, 2])
torch.Size([2, 278756])
>>> print(sample_rate)
44100
>>> data_volume_normalized, _ = torchaudio.load('foo.mp3', normalization=lambda x: torch.abs(x).max())
>>> print(data_volume_normalized.abs().max())
>>> data_vol_normalized, _ = torchaudio.load('foo.mp3', normalization=lambda x: torch.abs(x).max())
>>> print(data_vol_normalized.abs().max())
1.
"""
......@@ -88,6 +89,9 @@ def load(filepath,
def save(filepath, src, sample_rate, precision=16, channels_first=True):
"""Convenience function for `save_encinfo`.
"""
si = sox_signalinfo_t()
ch_idx = 0 if channels_first else 1
si.rate = sample_rate
......@@ -97,12 +101,17 @@ def save(filepath, src, sample_rate, precision=16, channels_first=True):
return save_encinfo(filepath, src, channels_first, si)
def save_encinfo(filepath, src, channels_first=True, signalinfo=None, encodinginfo=None, filetype=None):
"""Saves a Tensor with audio signal to disk as a standard format like mp3, wav, etc.
def save_encinfo(filepath,
src,
channels_first=True,
signalinfo=None,
encodinginfo=None,
filetype=None):
"""Saves a Tensor of an audio signal to disk as a standard format like mp3, wav, etc.
Args:
filepath (string): path to audio file
src (Tensor): an input 2D Tensor of shape `[L x C]` where L is
src (Tensor): an input 2D Tensor of shape `[C x L]` or `[L x C]` where L is
the number of audio frames, C is the number of channels
signalinfo (sox_signalinfo_t): a sox_signalinfo_t type, which could be helpful if the
audio type cannot be automatically determined
......@@ -129,10 +138,10 @@ def save_encinfo(filepath, src, channels_first=True, signalinfo=None, encodingin
if src.dim() == 1:
# 1d tensors are assumed to be mono signals
src.unsqueeze_(ch_idx)
elif src.dim() > 2 or src.size(ch_idx) > src.size(len_idx):
# assumes num_samples > num_channels
elif src.dim() > 2 or src.size(ch_idx) > 16:
# assumes num_channels <= 16
raise ValueError(
"Expected format (L x C), C < L, but found {}".format(src.size()))
"Expected format where C < 16, but found {}".format(src.size()))
# sox stores the sample rate as a float, though practically sample rates are almost always integers
# convert integers to floats
if not isinstance(signalinfo.rate, float):
......@@ -178,31 +187,10 @@ def info(filepath):
return _torch_sox.get_info(filepath)
def effect_names():
"""Gets list of valid sox effect names
Returns: list[str]
Example::
>>> EFFECT_NAMES = torchaudio.effect_names()
"""
return _torch_sox.get_effect_names()
def SoxEffect():
"""Create a object to hold sox effect and options to pass between python and c++
Returns: SoxEffects(object)
- ename (str), name of effect
- eopts (list[str]), list of effect options
"""
return _torch_sox.SoxEffect()
def sox_signalinfo_t():
"""Create a sox_signalinfo_t object. This object can be used to set the sample
rate, number of channels, length, bit precision and headroom multiplier
primarily for effects
r"""Create a sox_signalinfo_t object. This object can be used to set the sample
rate, number of channels, length, bit precision and headroom multiplier
primarily for effects
Returns: sox_signalinfo_t(object)
- rate (float), sample rate as a float, practically will likely be an integer float
......@@ -210,18 +198,25 @@ def sox_signalinfo_t():
- precision (int), bit precision
- length (int), length of audio, 0 for unspecified and -1 for unknown
- mult (float, optional), headroom multiplier for effects and None for no multiplier
Example::
>>> si = torchaudio.sox_signalinfo_t()
>>> si.channels = 1
>>> si.rate = 16000.
>>> si.precision = 16
>>> si.length = 0
"""
return _torch_sox.sox_signalinfo_t()
def sox_encodinginfo_t():
"""Create a sox_encodinginfo_t object. This object can be used to set the encoding
type, bit precision, compression factor, reverse bytes, reverse nibbles,
reverse bits and endianness. This can be used in an effects chain to encode the
final output or to save a file with a specific encoding. For example, one could
use the sox ulaw encoding to do 8-bit ulaw encoding. Note in a tensor output
the result will be a 32-bit number, but number of unique values will be determined by
the bit precision.
type, bit precision, compression factor, reverse bytes, reverse nibbles,
reverse bits and endianness. This can be used in an effects chain to encode the
final output or to save a file with a specific encoding. For example, one could
use the sox ulaw encoding to do 8-bit ulaw encoding. Note in a tensor output
the result will be a 32-bit number, but number of unique values will be determined by
the bit precision.
Returns: sox_encodinginfo_t(object)
- encoding (sox_encoding_t), output encoding
......@@ -231,6 +226,17 @@ def sox_encodinginfo_t():
- reverse_nibbles (sox_option_t), reverse nibbles, use sox_option_default
- reverse_bits (sox_option_t), reverse bits, use sox_option_default
- opposite_endian (sox_bool), change endianness, use sox_false
Example::
>>> ei = torchaudio.sox_encodinginfo_t()
>>> ei.encoding = torchaudio.get_sox_encoding_t(1)
>>> ei.bits_per_sample = 16
>>> ei.compression = 0
>>> ei.reverse_bytes = torchaudio.get_sox_option_t(2)
>>> ei.reverse_nibbles = torchaudio.get_sox_option_t(2)
>>> ei.reverse_bits = torchaudio.get_sox_option_t(2)
>>> ei.opposite_endian = torchaudio.get_sox_bool(0)
"""
ei = _torch_sox.sox_encodinginfo_t()
sdo = get_sox_option_t(2) # sox_default_option
......@@ -245,7 +251,7 @@ def get_sox_encoding_t(i=None):
Args:
i (int, optional): choose type or get a dict with all possible options
use .__members__ to see all options when not specified
use `__members__` to see all options when not specified
Returns:
sox_encoding_t: a sox_encoding_t type for output encoding
"""
......@@ -261,7 +267,7 @@ def get_sox_option_t(i=2):
Args:
i (int, optional): choose type or get a dict with all possible options
use .__members__ to see all options when not specified.
use `__members__` to see all options when not specified.
Defaults to sox_option_default.
Returns:
sox_option_t: a sox_option_t type
......@@ -277,7 +283,7 @@ def get_sox_bool(i=0):
Args:
i (int, optional): choose type or get a dict with all possible options
use .__members__ to see all options when not specified.
use `__members__` to see all options when not specified.
Defaults to sox_false.
Returns:
sox_bool: a sox_bool type
......@@ -289,22 +295,25 @@ def get_sox_bool(i=0):
def initialize_sox():
"""Initialize sox for effects chain. Not required for simple loading. Importantly,
only initialize this once and do not shutdown until you have done effect chain
calls even when loading multiple files.
"""Initialize sox for use with effects chains. This is not required for simple
loading. Importantly, only run `initialize_sox` once and do not shutdown
after each effect chain, but rather once you are finished with all effects chains.
"""
return _torch_sox.initialize_sox()
def shutdown_sox():
"""Shutdown sox for effects chain. Not required for simple loading. Importantly,
only call once. Attempting to re-initialize sox will result seg faults.
only call once. Attempting to re-initialize sox will result in seg faults.
"""
return _torch_sox.shutdown_sox()
def _audio_normalization(signal, normalization):
# assumes signed 32-bit depth, which is what sox uses internally
"""Audio normalization of a tensor in-place. The normalization can be a bool,
a number, or a callable that takes the audio tensor as an input. SoX uses
32-bit signed integers internally, thus bool normalizes based on that assumption.
"""
if not normalization:
return
......
......@@ -35,13 +35,18 @@ def make_manifest(dir):
def read_audio(fp, downsample=True):
sig, sr = torchaudio.load(fp)
if downsample:
# 48khz -> 16 khz
if sig.size(0) % 3 == 0:
sig = sig[::3].contiguous()
else:
sig = sig[:-(sig.size(0) % 3):3].contiguous()
E = torchaudio.sox_effects.SoxEffects()
E.set_input_file(fp)
E.sox_append_effect_to_chain("gain", ["-h"])
E.sox_append_effect_to_chain("channels", [1])
E.sox_append_effect_to_chain("rate", [16000])
E.sox_append_effect_to_chain("gain", ["-rh"])
E.sox_append_effect_to_chain("dither", ["-s"])
sig, sr = E.sox_build_flow_effects()
else:
sig, sr = torchaudio.load(fp)
sig = sig.contiguous()
return sig, sr
......@@ -168,8 +173,8 @@ class VCTK(data.Dataset):
# download files
try:
os.makedirs(os.path.join(self.root, self.raw_folder))
os.makedirs(os.path.join(self.root, self.processed_folder))
os.makedirs(os.path.join(self.root, self.raw_folder))
except OSError as e:
if e.errno == errno.EEXIST:
pass
......@@ -191,6 +196,7 @@ class VCTK(data.Dataset):
os.unlink(file_path)
# process and save as torch files
torchaudio.initialize_sox()
print('Processing...')
shutil.copyfile(
os.path.join(dset_abs_path, "COPYING"),
......@@ -213,10 +219,10 @@ class VCTK(data.Dataset):
f_rel_no_ext = os.path.basename(f).rsplit(".", 1)[0]
sig = read_audio(f, downsample=self.downsample)[0]
tensors.append(sig)
lengths.append(sig.size(0))
lengths.append(sig.size(1))
labels.append(utterences[f_rel_no_ext])
self.max_len = sig.size(0) if sig.size(
0) > self.max_len else self.max_len
self.max_len = sig.size(1) if sig.size(
1) > self.max_len else self.max_len
# sort sigs/labels: longest -> shortest
tensors, labels = zip(*[(b, c) for (a, b, c) in sorted(
zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)])
......@@ -232,5 +238,5 @@ class VCTK(data.Dataset):
self._write_info((n * self.chunk_size) + i + 1)
if not self.dev_mode:
shutil.rmtree(raw_abs_dir, ignore_errors=True)
torchaudio.shutdown_sox()
print('Done!')
......@@ -128,12 +128,12 @@ class YESNO(data.Dataset):
full_path = os.path.join(dset_abs_path, f)
sig, sr = torchaudio.load(full_path)
tensors.append(sig)
lengths.append(sig.size(0))
lengths.append(sig.size(1))
labels.append(os.path.basename(f).split(".", 1)[0].split("_"))
# sort sigs/labels: longest -> shortest
tensors, labels = zip(*[(b, c) for (a, b, c) in sorted(
zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)])
self.max_len = tensors[0].size(0)
self.max_len = tensors[0].size(1)
torch.save(
(tensors, labels),
os.path.join(
......
from __future__ import division, print_function
import os.path
import torch
import _torch_sox
from torchaudio import save as save_new, load as load_new
import torchaudio
def load(filepath, out=None, normalization=None, num_frames=-1, offset=0):
def load(filepath, out=None, normalization=None, num_frames=0, offset=0):
"""Loads an audio file from disk into a Tensor. The default options have
changed as of torchaudio 0.2 and this function maintains option defaults
from version 0.1.
changed as of torchaudio 0.2 and this function maintains option defaults
from version 0.1.
Args:
filepath (string): path to audio file
......@@ -26,20 +27,20 @@ def load(filepath, out=None, normalization=None, num_frames=-1, offset=0):
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> data, sample_rate = torchaudio.legacy.load('foo.mp3')
>>> print(data.size())
torch.Size([278756, 2])
>>> print(sample_rate)
44100
"""
return load_new(filepath, out, normalization, False, num_frames, offset)
return torchaudio.load(filepath, out, normalization, False, num_frames, offset)
def save(filepath, src, sample_rate, precision=32):
"""Saves a Tensor with audio signal to disk as a standard format like mp3, wav, etc.
The default options have changed as of torchaudio 0.2 and this function maintains
option defaults from version 0.1.
The default options have changed as of torchaudio 0.2 and this function maintains
option defaults from version 0.1.
Args:
filepath (string): path to audio file
......@@ -50,8 +51,8 @@ def save(filepath, src, sample_rate, precision=32):
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> torchaudio.save('foo.wav', data, sample_rate)
>>> data, sample_rate = torchaudio.legacy.load('foo.mp3')
>>> torchaudio.legacy.save('foo.wav', data, sample_rate)
"""
save_new(filepath, src, sample_rate, precision, False)
torchaudio.save(filepath, src, sample_rate, precision, False)
from __future__ import division, print_function
import torch
import _torch_sox
import torchaudio
EFFECT_NAMES = set(_torch_sox.get_effect_names())
"""
Notes:
sox_signalinfo_t {
sox_rate_t rate; /**< samples per second, 0 if unknown */
unsigned channels; /**< number of sound channels, 0 if unknown */
unsigned precision; /**< bits per sample, 0 if unknown */
sox_uint64_t length; /**< samples * chans in file, 0 if unspecified, -1 if unknown */
double * mult; /**< Effects headroom multiplier; may be null */
}
typedef struct sox_encodinginfo_t {
sox_encoding_t encoding; /**< format of sample numbers */
unsigned bits_per_sample; /**< 0 if unknown or variable; uncompressed value if lossless; compressed value if lossy */
double compression; /**< compression factor (where applicable) */
sox_option_t reverse_bytes; /** use sox_option_default */
sox_option_t reverse_nibbles; /** use sox_option_default */
sox_option_t reverse_bits; /** use sox_option_default */
sox_bool opposite_endian; /** use sox_false */
}
sox_encodings_t = {
"SOX_ENCODING_UNKNOWN",
"SOX_ENCODING_SIGN2",
"SOX_ENCODING_UNSIGNED",
"SOX_ENCODING_FLOAT",
"SOX_ENCODING_FLOAT_TEXT",
"SOX_ENCODING_FLAC",
"SOX_ENCODING_HCOM",
"SOX_ENCODING_WAVPACK",
"SOX_ENCODING_WAVPACKF",
"SOX_ENCODING_ULAW",
"SOX_ENCODING_ALAW",
"SOX_ENCODING_G721",
"SOX_ENCODING_G723",
"SOX_ENCODING_CL_ADPCM",
"SOX_ENCODING_CL_ADPCM16",
"SOX_ENCODING_MS_ADPCM",
"SOX_ENCODING_IMA_ADPCM",
"SOX_ENCODING_OKI_ADPCM",
"SOX_ENCODING_DPCM",
"SOX_ENCODING_DWVW",
"SOX_ENCODING_DWVWN",
"SOX_ENCODING_GSM",
"SOX_ENCODING_MP3",
"SOX_ENCODING_VORBIS",
"SOX_ENCODING_AMR_WB",
"SOX_ENCODING_AMR_NB",
"SOX_ENCODING_CVSD",
"SOX_ENCODING_LPC10",
"SOX_ENCODING_OPUS",
"SOX_ENCODINGS"
}
"""
class SoxEffects(object):
def effect_names():
    """Return the list of valid sox effect names, as reported by ``_torch_sox``.

    Returns: list[str]

    Example::
        >>> EFFECT_NAMES = torchaudio.sox_effects.effect_names()
    """
    names = _torch_sox.get_effect_names()
    return names
def SoxEffect():
    """Construct the object used to hand one sox effect from python to c++.

    Returns: SoxEffect(object)
        - ename (str), name of effect
        - eopts (list[str]), list of effect options
    """
    effect = _torch_sox.SoxEffect()
    return effect
class SoxEffectsChain(object):
"""SoX effects chain class.
"""
EFFECTS_AVAILABLE = set(effect_names())
EFFECTS_UNIMPLEMENTED = set(["spectrogram", "splice", "noiseprof", "fir"])
def __init__(self, normalization=True, channels_first=True, out_siginfo=None, out_encinfo=None, filetype="raw"):
self.input_file = None
......@@ -73,15 +43,12 @@ class SoxEffects(object):
self.normalization = normalization
self.channels_first = channels_first
def sox_check_effect(self, e):
if e.lower() not in EFFECT_NAMES:
raise LookupError("Effect name, {}, not valid".format(e.lower()))
return e.lower()
def sox_append_effect_to_chain(self, ename, eargs=None):
e = torchaudio.SoxEffect()
def append_effect_to_chain(self, ename, eargs=None):
"""Append effect to a sox effects chain.
"""
e = SoxEffect()
# check if we have a valid effect
ename = self.sox_check_effect(ename)
ename = self._check_effect(ename)
if eargs is None or eargs == []:
eargs = [""]
elif not isinstance(eargs, list):
......@@ -96,13 +63,15 @@ class SoxEffects(object):
self.chain.append(e)
def sox_build_flow_effects(self, out=None):
"""Build effects chain and flow effects from input file to output tensor
"""
# initialize output tensor
if out is not None:
torchaudio.check_input(out)
else:
out = torch.FloatTensor()
if not len(self.chain):
e = torchaudio.SoxEffect()
e = SoxEffect()
e.ename = "no_effects"
e.eopts = [""]
self.chain.append(e)
......@@ -122,11 +91,22 @@ class SoxEffects(object):
return out, sr
def clear_chain(self):
"""Clear effects chain in python.

Rebinds ``self.chain`` to a new empty list; no other attribute is modified here.
"""
self.chain = []
def set_input_file(self, input_file):
"""Set input file for input of chain.

Only stores ``input_file`` on ``self.input_file``; nothing is opened or validated here.
"""
self.input_file = input_file
def _check_effect(self, e):
    """Validate an effect name and return it lower-cased.

    Args:
        e (str): effect name as given by the caller; matching is case-insensitive.

    Returns:
        str: the lower-cased effect name.

    Raises:
        NotImplementedError: if the name is listed in ``EFFECTS_UNIMPLEMENTED``.
        LookupError: if the name is not in ``EFFECTS_AVAILABLE``.
    """
    # normalise once instead of calling lower() in every branch
    ename = e.lower()
    # the unimplemented set is tested first, so those names never reach the LookupError branch
    if ename in self.EFFECTS_UNIMPLEMENTED:
        raise NotImplementedError("This effect ({}) is not implemented in torchaudio".format(e))
    if ename not in self.EFFECTS_AVAILABLE:
        raise LookupError("Effect name, {}, not valid".format(ename))
    return ename
# https://stackoverflow.com/questions/12472338/flattening-a-list-recursively
# convenience function to flatten list recursively
def _flatten(self, x):
......
......@@ -109,18 +109,21 @@ int read_audio_file(
sox_encodinginfo_t* ei,
const char* ft) {
SoxDescriptor fd(sox_open_read(
file_name.c_str(),
/*signal=*/si,
/*encoding=*/ei,
/*filetype=*/ft));
SoxDescriptor fd(sox_open_read(file_name.c_str(), si, ei, ft));
if (fd.get() == nullptr) {
throw std::runtime_error("Error opening audio file");
}
// signal info
const int number_of_channels = fd->signal.channels;
const int sample_rate = fd->signal.rate;
const int64_t total_length = fd->signal.length;
// multiply offset and number of frames by number of channels
offset *= number_of_channels;
nframes *= number_of_channels;
if (total_length == 0) {
throw std::runtime_error("Error reading audio file: unknown length");
}
......@@ -133,14 +136,10 @@ int read_audio_file(
if (offset > 0) {
buffer_length -= offset;
}
if (nframes != -1 && buffer_length > nframes) {
if (nframes > 0 && buffer_length > nframes) {
buffer_length = nframes;
}
// buffer length and offset need to be multipled by the number of channels
buffer_length *= number_of_channels;
offset *= number_of_channels;
// seek to offset point before reading data
if (sox_seek(fd.get(), offset, 0) == SOX_EOF) {
throw std::runtime_error("sox_seek reached EOF, try reducing offset or num_samples");
......@@ -149,6 +148,7 @@ int read_audio_file(
// read data and fill output tensor
read_audio(fd, output, buffer_length);
// L x C -> C x L, if desired
if (ch_first) {
output.transpose_(1, 0);
}
......@@ -167,7 +167,6 @@ void write_audio_file(
"Error writing audio file: input tensor must be contiguous");
}
// remove ?
#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
si->mult = nullptr;
#endif
......@@ -248,20 +247,12 @@ int build_flow_effects(const std::string& file_name,
target_encoding->opposite_endian = sox_false; // Reverse endianness
}
// set target precision / bits_per_sample if it's still 0
//if (target_signal->precision == 0)
// target_signal->precision = input->signal.precision;
//if (target_encoding->bits_per_sample == 0)
// target_encoding->bits_per_sample = input->signal.precision;
// check for rate or channels effect and change the output signalinfo accordingly
for (SoxEffect se : pyeffs) {
if (se.ename == "rate") {
target_signal->rate = std::stod(se.eopts[0]);
//se.eopts[0] = "";
} else if (se.ename == "channels") {
target_signal->channels = std::stoi(se.eopts[0]);
//se.eopts[0] = "";
}
}
......@@ -271,7 +262,6 @@ int build_flow_effects(const std::string& file_name,
// create buffer and buffer_size for output in memwrite
char* buffer;
size_t buffer_size;
//const char* otype = (file_type.empty()) ? (const char*) "raw" : file_type.c_str();
#ifdef __APPLE__
// According to Mozilla Deepspeech sox_open_memstream_write doesn't work
// with OSX
......@@ -287,7 +277,9 @@ int build_flow_effects(const std::string& file_name,
target_encoding,
file_type, nullptr);
#endif
assert(output);
if (output == nullptr) {
throw std::runtime_error("Error opening output memstream/temporary file");
}
// Setup the effects chain to decode/resample
sox_effects_chain_t* chain =
sox_create_effects_chain(&input->encoding, &output->encoding);
......@@ -307,11 +299,12 @@ int build_flow_effects(const std::string& file_name,
} else {
int num_opts = tae.eopts.size();
char* sox_args[max_num_eopts];
//for(std::string s : tae.eopts) {
for(std::vector<std::string>::size_type i = 0; i != tae.eopts.size(); i++) {
sox_args[i] = (char*) tae.eopts[i].c_str();
}
sox_effect_options(e, num_opts, sox_args);
if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) {
throw std::runtime_error("invalid effect options, see SoX docs for details");
}
}
sox_add_effect(chain, e, &interm_signal, &input->signal);
free(e);
......@@ -331,9 +324,21 @@ int build_flow_effects(const std::string& file_name,
sox_close(output);
sox_close(input);
// Resize output tensor to desired dimensions
int nc = interm_signal.channels;
int ns = interm_signal.length;
// Resize output tensor to desired dimensions, different effects result in output->signal.length,
// interm_signal.length and buffer size being inconsistent with the result of the file output.
// We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
int nc, ns;
if (output->signal.length == 0) {
if (interm_signal.length > (buffer_size * 10)) {
ns = buffer_size / 2;
} else {
ns = interm_signal.length;
}
nc = interm_signal.channels;
} else {
nc = output->signal.channels;
ns = output->signal.length;
}
otensor.resize_({ns/nc, nc});
otensor = otensor.contiguous();
......
......@@ -27,7 +27,6 @@ int read_audio_file(
void write_audio_file(
const std::string& file_name,
at::Tensor tensor,
bool ch_first,
sox_signalinfo_t* si,
sox_encodinginfo_t* ei,
const char* extension)
......@@ -55,6 +54,7 @@ int shutdown_sox();
/// and the sample rate of the output tensor.
int build_flow_effects(const std::string& file_name,
at::Tensor otensor,
bool ch_first,
sox_signalinfo_t* target_signal,
sox_encodinginfo_t* target_encoding,
const char* file_type,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment