Commit 4554d242 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Fix load behavior for 24-bit input (#2084)

Summary:
## bug description

When a 24 bits-par-sample audio is loaded via file-like object,
the loaded Tensor is wrong. It was fine if the audio is loaded
from local file.

## The cause of the bug

The core of the sox's decoding mechanism is `sox_read` function,
one of which parameter is the maximum number of samples to decode
from the given buffer.

https://fossies.org/dox/sox-14.4.2/formats_8c.html#a2a4f0194a0f919d4f38c57b81aa2c06f)]

The `sox_read` function is called in what is called `drain` effect,
callback and this callback receives output buffer and its size in
byte. The previous implementation passed this size value as
the argument of `sox_read` for the maximum number of samples to
read. Since buffer size is larger than the number of samples fit in
the buffer, `sox_read` function always consumed the entire
buffer. (This behavior is not wrong except when the input is
24 bits-per-sample and file-like object.)

When the input is read from file-like object, inside of drain
callback, new data are fetched via Python's `read` method and
loaded on fixed-size memory region. The size of this memory region
can be adjusted via `torchaudio.utils.sox_utils.set_buffer_size`,
but the default value is 8096.

If the input format is 24 bits-per-sample, the end of memory region
does not necessarily correspond to the end of a valid sample.
When `sox_read` consumes all the data in the buffer region, the data
at the end introduces some unexpected values.
This causes the aforementioned bug

## Fix

Pass proper (better estimated) maximum number of samples decodable to
`sox_read`.

Pull Request resolved: https://github.com/pytorch/audio/pull/2084

Reviewed By: carolineechen

Differential Revision: D33236947

Pulled By: mthrok

fbshipit-source-id: 171d9b7945f81db54f98362a68b20f2f95bb11a4
parent 2476dd2d
...@@ -317,25 +317,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -317,25 +317,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
because `load` function is rigrously tested for file path inputs to match libsox's result, because `load` function is rigrously tested for file path inputs to match libsox's result,
""" """
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_fileobj(self, ext, compression): def test_fileobj(self, ext, kwargs):
"""Loading audio via file object returns the same result as via file path.""" """Loading audio via file object returns the same result as via file path."""
sample_rate = 16000 sample_rate = 16000
format_ = ext if ext in ['mp3'] else None format_ = ext if ext in ['mp3'] else None
path = self.get_temp_path(f'test.{ext}') path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
path, sample_rate, num_channels=2, path, sample_rate, num_channels=2, **kwargs)
compression=compression)
expected, _ = sox_io_backend.load(path) expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as fileobj: with open(path, 'rb') as fileobj:
...@@ -345,25 +346,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -345,25 +346,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found) self.assertEqual(expected, found)
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_bytesio(self, ext, compression): def test_bytesio(self, ext, kwargs):
"""Loading audio via BytesIO object returns the same result as via file path.""" """Loading audio via BytesIO object returns the same result as via file path."""
sample_rate = 16000 sample_rate = 16000
format_ = ext if ext in ['mp3'] else None format_ = ext if ext in ['mp3'] else None
path = self.get_temp_path(f'test.{ext}') path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
path, sample_rate, num_channels=2, path, sample_rate, num_channels=2, **kwargs)
compression=compression)
expected, _ = sox_io_backend.load(path) expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_: with open(path, 'rb') as file_:
...@@ -374,17 +376,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -374,17 +376,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found) self.assertEqual(expected, found)
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_bytesio_clogged(self, ext, compression): def test_bytesio_clogged(self, ext, kwargs):
"""Loading audio via clogged file object returns the same result as via file path. """Loading audio via clogged file object returns the same result as via file path.
This test case validates the case where fileobject returns shorter bytes than requeted. This test case validates the case where fileobject returns shorter bytes than requeted.
...@@ -394,8 +398,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -394,8 +398,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
path = self.get_temp_path(f'test.{ext}') path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
path, sample_rate, num_channels=2, path, sample_rate, num_channels=2, **kwargs)
compression=compression)
expected, _ = sox_io_backend.load(path) expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_: with open(path, 'rb') as file_:
...@@ -406,17 +409,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -406,17 +409,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found) self.assertEqual(expected, found)
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_bytesio_tiny(self, ext, compression): def test_bytesio_tiny(self, ext, kwargs):
"""Loading very small audio via file object returns the same result as via file path. """Loading very small audio via file object returns the same result as via file path.
""" """
sample_rate = 16000 sample_rate = 16000
...@@ -424,8 +429,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -424,8 +429,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
path = self.get_temp_path(f'test.{ext}') path = self.get_temp_path(f'test.{ext}')
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
path, sample_rate, num_channels=2, path, sample_rate, num_channels=2, duration=1 / 1600, **kwargs)
compression=compression, duration=1 / 1600)
expected, _ = sox_io_backend.load(path) expected, _ = sox_io_backend.load(path)
with open(path, 'rb') as file_: with open(path, 'rb') as file_:
...@@ -436,17 +440,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -436,17 +440,19 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
self.assertEqual(expected, found) self.assertEqual(expected, found)
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_tarfile(self, ext, compression): def test_tarfile(self, ext, kwargs):
"""Loading compressed audio via file-like object returns the same result as via file path.""" """Loading compressed audio via file-like object returns the same result as via file path."""
sample_rate = 16000 sample_rate = 16000
format_ = ext if ext in ['mp3'] else None format_ = ext if ext in ['mp3'] else None
...@@ -455,8 +461,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -455,8 +461,7 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
archive_path = self.get_temp_path('archive.tar.gz') archive_path = self.get_temp_path('archive.tar.gz')
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
audio_path, sample_rate, num_channels=2, audio_path, sample_rate, num_channels=2, **kwargs)
compression=compression)
expected, _ = sox_io_backend.load(audio_path) expected, _ = sox_io_backend.load(audio_path)
with tarfile.TarFile(archive_path, 'w') as tarobj: with tarfile.TarFile(archive_path, 'w') as tarobj:
...@@ -474,24 +479,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase): ...@@ -474,24 +479,26 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
@skipIfNoModule("requests") @skipIfNoModule("requests")
class TestFileObjectHttp(HttpServerMixin, PytorchTestCase): class TestFileObjectHttp(HttpServerMixin, PytorchTestCase):
@parameterized.expand([ @parameterized.expand([
('wav', None), ('wav', {'bit_depth': 16}),
('mp3', 128), ('wav', {'bit_depth': 24}),
('mp3', 320), ('wav', {'bit_depth': 32}),
('flac', 0), ('mp3', {'compression': 128}),
('flac', 5), ('mp3', {'compression': 320}),
('flac', 8), ('flac', {'compression': 0}),
('vorbis', -1), ('flac', {'compression': 5}),
('vorbis', 10), ('flac', {'compression': 8}),
('amb', None), ('vorbis', {'compression': -1}),
('vorbis', {'compression': 10}),
('amb', {}),
]) ])
def test_requests(self, ext, compression): def test_requests(self, ext, kwargs):
sample_rate = 16000 sample_rate = 16000
format_ = ext if ext in ['mp3'] else None format_ = ext if ext in ['mp3'] else None
audio_file = f'test.{ext}' audio_file = f'test.{ext}'
audio_path = self.get_temp_path(audio_file) audio_path = self.get_temp_path(audio_file)
sox_utils.gen_audio_file( sox_utils.gen_audio_file(
audio_path, sample_rate, num_channels=2, compression=compression) audio_path, sample_rate, num_channels=2, **kwargs)
expected, _ = sox_io_backend.load(audio_path) expected, _ = sox_io_backend.load(audio_path)
url = self.get_url(audio_file) url = self.get_url(audio_file)
......
...@@ -103,6 +103,14 @@ auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) ...@@ -103,6 +103,14 @@ auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
// The following part is practically same as "input" effect // The following part is practically same as "input" effect
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48 // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
// At this point, osamp represents the buffer size in bytes,
// but sox_read expects the maximum number of samples ready to read.
// Normally, this is fine, but in case when the samples are not 4-byte
// aligned, (e.g. sample is 24bits), the resulting signal is not correct.
// https://github.com/pytorch/audio/issues/2083
if (sf->encoding.bits_per_sample > 0)
*osamp /= (sf->encoding.bits_per_sample / 8);
// Ensure that it's a multiple of the number of channels // Ensure that it's a multiple of the number of channels
*osamp -= *osamp % effp->out_signal.channels; *osamp -= *osamp % effp->out_signal.channels;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment