Commit 4554d242 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Fix load behavior for 24-bit input (#2084)

Summary:
## bug description

When a 24 bits-par-sample audio is loaded via file-like object,
the loaded Tensor is wrong. It was fine if the audio is loaded
from local file.

## The cause of the bug

The core of the sox's decoding mechanism is `sox_read` function,
one of which parameter is the maximum number of samples to decode
from the given buffer.

https://fossies.org/dox/sox-14.4.2/formats_8c.html#a2a4f0194a0f919d4f38c57b81aa2c06f)]

The `sox_read` function is called in what is called `drain` effect,
callback and this callback receives output buffer and its size in
byte. The previous implementation passed this size value as
the argument of `sox_read` for the maximum number of samples to
read. Since buffer size is larger than the number of samples fit in
the buffer, `sox_read` function always consumed the entire
buffer. (This behavior is not wrong except when the input is
24 bits-per-sample and file-like object.)

When the input is read from file-like object, inside of drain
callback, new data are fetched via Python's `read` method and
loaded on fixed-size memory region. The size of this memory region
can be adjusted via `torchaudio.utils.sox_utils.set_buffer_size`,
but the default value is 8096.

If the input format is 24 bits-per-sample, the end of memory region
does not necessarily correspond to the end of a valid sample.
When `sox_read` consumes all the data in the buffer region, the data
at the end introduces some unexpected values.
This causes the aforementioned bug

## Fix

Pass proper (better estimated) maximum number of samples decodable to
`sox_read`.

Pull Request resolved: https://github.com/pytorch/audio/pull/2084

Reviewed By: carolineechen

Differential Revision: D33236947

Pulled By: mthrok

fbshipit-source-id: 171d9b7945f81db54f98362a68b20f2f95bb11a4
parent 2476dd2d
......@@ -317,25 +317,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
because `load` function is rigrously tested for file path inputs to match libsox's result,
"""
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_fileobj(self, ext, compression):
def test_fileobj(self, ext, kwargs):
"""Loading audio via file object returns the same result as via file path."""
sample_rate = 16000
format_ = ext if ext in ['mp3'] else None
path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file(
path, sample_rate, num_channels=2,
compression=compression)
path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as fileobj:
......@@ -345,25 +346,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found)
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_bytesio(self, ext, compression):
def test_bytesio(self, ext, kwargs):
"""Loading audio via BytesIO object returns the same result as via file path."""
sample_rate = 16000
format_ = ext if ext in ['mp3'] else None
path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file(
path, sample_rate, num_channels=2,
compression=compression)
path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_:
......@@ -374,17 +376,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found)
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_bytesio_clogged(self, ext, compression):
def test_bytesio_clogged(self, ext, kwargs):
"""Loading audio via clogged file object returns the same result as via file path.
This test case validates the case where fileobject returns shorter bytes than requeted.
......@@ -394,8 +398,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file(
path, sample_rate, num_channels=2,
compression=compression)
path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_:
......@@ -406,17 +409,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found)
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_bytesio_tiny(self, ext, compression):
def test_bytesio_tiny(self, ext, kwargs):
"""Loading very small audio via file object returns the same result as via file path.
"""
sample_rate = 16000
......@@ -424,8 +429,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file(
path, sample_rate, num_channels=2,
compression=compression, duration=1 / 1600)
path, sample_rate, num_channels=2, duration=1 / 1600, **kwargs)
expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_:
......@@ -436,17 +440,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found)
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_tarfile(self, ext, compression):
def test_tarfile(self, ext, kwargs):
"""Loading compressed audio via file-like object returns the same result as via file path."""
sample_rate = 16000
format_ = ext if ext in ['mp3'] else None
......@@ -455,8 +461,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
archive_path = self.get_temp_path('archive.tar.gz')
sox_utils.gen_audio_file(
audio_path, sample_rate, num_channels=2,
compression=compression)
audio_path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(audio_path)
with tarfile.TarFile(archive_path, 'w') as tarobj:
......@@ -474,24 +479,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
@skipIfNoModule("requests")
class TestFileObjectHttp(HttpServerMixin, PytorchTestCase):
@parameterized.expand([
('wav', None),
('mp3', 128),
('mp3', 320),
('flac', 0),
('flac', 5),
('flac', 8),
('vorbis', -1),
('vorbis', 10),
('amb', None),
('wav', {'bit_depth': 16}),
('wav', {'bit_depth': 24}),
('wav', {'bit_depth': 32}),
('mp3', {'compression': 128}),
('mp3', {'compression': 320}),
('flac', {'compression': 0}),
('flac', {'compression': 5}),
('flac', {'compression': 8}),
('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
])
def test_requests(self, ext, compression):
def test_requests(self, ext, kwargs):
sample_rate = 16000
format_ = ext if ext in ['mp3'] else None
audio_file = f'test.{ext}'
audio_path = self.get_temp_path(audio_file)
sox_utils.gen_audio_file(
audio_path, sample_rate, num_channels=2, compression=compression)
audio_path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(audio_path)
url = self.get_url(audio_file)
......
......@@ -103,6 +103,14 @@ auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
// The following part is practically same as "input" effect
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
// At this point, osamp represents the buffer size in bytes,
// but sox_read expects the maximum number of samples ready to read.
// Normally, this is fine, but in case when the samples are not 4-byte
// aligned, (e.g. sample is 24bits), the resulting signal is not correct.
// https://github.com/pytorch/audio/issues/2083
if (sf->encoding.bits_per_sample > 0)
*osamp /= (sf->encoding.bits_per_sample / 8);
// Ensure that it's a multiple of the number of channels
*osamp -= *osamp % effp->out_signal.channels;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment