Fix load from file object for small files and shorter bytes (#1181)

47d97e30 · moto · GitHub · 41c76a17 · 47d97e30 · 47d97e30
Unverified Commit 47d97e30 authored Jan 27, 2021 by moto Committed by GitHub Jan 27, 2021
4 changed files
--- a/test/torchaudio_unittest/sox_io_backend/load_test.py
+++ b/test/torchaudio_unittest/sox_io_backend/load_test.py
@@ -380,6 +380,19 @@ class TestLoadWithoutExtension(PytorchTestCase):
        assert sr == 16000
+class CloggedFileObj:
+    def __init__(self, fileobj):
+        self.fileobj = fileobj
+        self.buffer = b''
+    def read(self, n):
+        if not self.buffer:
+            self.buffer += self.fileobj.read(n)
+        ret = self.buffer[:2]
+        self.buffer = self.buffer[2:]
+        return ret
 @skipIfNoExtension
 @skipIfNoExec('sox')
 class TestFileObject(TempDirMixin, PytorchTestCase):
@@ -444,6 +457,68 @@ class TestFileObject(TempDirMixin, PytorchTestCase):
        assert sr == sample_rate
        self.assertEqual(expected, found)
+    @parameterized.expand([
+        ('wav', None),
+        ('mp3', 128),
+        ('mp3', 320),
+        ('flac', 0),
+        ('flac', 5),
+        ('flac', 8),
+        ('vorbis', -1),
+        ('vorbis', 10),
+        ('amb', None),
+    ])
+    def test_bytesio_clogged(self, ext, compression):
+        """Loading audio via clogged file object returns the same result as via file path.
+        This test case validates the case where fileobject returns shorter bytes than requeted.
+        """
+        sample_rate = 16000
+        format_ = ext if ext in ['mp3'] else None
+        path = self.get_temp_path(f'test.{ext}')
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels=2,
+            compression=compression)
+        expected, _ = sox_io_backend.load(path)
+        with open(path, 'rb') as file_:
+            fileobj = CloggedFileObj(io.BytesIO(file_.read()))
+        found, sr = sox_io_backend.load(fileobj, format=format_)
+        assert sr == sample_rate
+        self.assertEqual(expected, found)
+    @parameterized.expand([
+        ('wav', None),
+        ('mp3', 128),
+        ('mp3', 320),
+        ('flac', 0),
+        ('flac', 5),
+        ('flac', 8),
+        ('vorbis', -1),
+        ('vorbis', 10),
+        ('amb', None),
+    ])
+    def test_bytesio_tiny(self, ext, compression):
+        """Loading very small audio via file object returns the same result as via file path.
+        """
+        sample_rate = 16000
+        format_ = ext if ext in ['mp3'] else None
+        path = self.get_temp_path(f'test.{ext}')
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels=2,
+            compression=compression, duration=1 / 1600)
+        expected, _ = sox_io_backend.load(path)
+        with open(path, 'rb') as file_:
+            fileobj = io.BytesIO(file_.read())
+        found, sr = sox_io_backend.load(fileobj, format=format_)
+        assert sr == sample_rate
+        self.assertEqual(expected, found)
    @parameterized.expand([
        ('wav', None),
        ('mp3', 128),

--- a/torchaudio/csrc/sox/effects.cpp
+++ b/torchaudio/csrc/sox/effects.cpp
@@ -137,51 +137,65 @@ c10::intrusive_ptr<TensorSignal> apply_effects_file(
 #ifdef TORCH_API_INCLUDE_EXTENSION_H
+// Streaming decoding over file-like object is tricky because libsox operates on
+// FILE pointer. The folloing is what `sox` and `play` commands do
+//  - file input -> FILE pointer
+//  - URL input -> call wget in suprocess and pipe the data -> FILE pointer
+//  - stdin -> FILE pointer
+//
+// We want to, instead, fetch byte strings chunk by chunk, consume them, and
+// discard.
+//
+// Here is the approach
+// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+// chunk of byte string
+//    This will perform header-based format detection, if necessary, then fill
+//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
+//    which returns FILE* which points the buffer of the provided byte string.
+// 2. Each time sox reads a chunk from the FILE*, we update the underlying
+// buffer in a way that it
+//    starts with unseen data, and append the new data read from the given
+//    fileobj. This will trick libsox as if it keeps reading from the FILE*
+//    continuously.
+// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
 std::tuple<torch::Tensor, int64_t> apply_effects_fileobj(
    py::object fileobj,
    std::vector<std::vector<std::string>> effects,
    c10::optional<bool>& normalize,
    c10::optional<bool>& channels_first,
    c10::optional<std::string>& format) {
-  // Streaming decoding over file-like object is tricky because libsox operates
+  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
-  // on FILE pointer. The folloing is what `sox` and `play` commands do
-  //  - file input -> FILE pointer
-  //  - URL input -> call wget in suprocess and pipe the data -> FILE pointer
-  //  - stdin -> FILE pointer
  //
-  // We want to, instead, fetch byte strings chunk by chunk, consume them, and
+  // For certain format (such as FLAC), libsox keeps reading the content at
-  // discard.
+  // the initialization unless it reaches EOF even when the header is properly
+  // parsed. (Making buffer size 8192, which is way bigger than the header,
+  // resulted in libsox consuming all the buffer content at the time it opens
+  // the file.) Therefore buffer has to always contain valid data, except after
+  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
+  // first check if there is enough data to fill the buffer. `read_fileobj`
+  // repeatedly calls `read`  method until it receives the requested lenght of
+  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
+  // the whole audio data are fetched.
  //
-  // Here is the approach
+  // * This can be changed with `torchaudio.utils.sox_utils.set_buffer_size`.
-  // 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+  auto capacity =
-  // chunk of byte string
+      (sox_get_globals()->bufsiz > 256) ? sox_get_globals()->bufsiz : 256;
-  //    This will perform header-based format detection, if necessary, then fill
+  std::string buffer(capacity, '\0');
-  //    the metadata of sox_format_t. Internally, sox_open_mem_read uses
+  auto* in_buf = const_cast<char*>(buffer.data());
-  //    fmemopen, which returns FILE* which points the buffer of the provided
+  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
-  //    byte string.
+  // If the file is shorter than 256, then libsox cannot read the header.
-  // 2. Each time sox reads a chunk from the FILE*, we update the underlying
+  auto in_buffer_size = (num_read > 256) ? num_read : 256;
-  // buffer in a way that it
-  //    starts with unseen data, and append the new data read from the given
-  //    fileobj. This will trick libsox as if it keeps reading from the FILE*
-  //    continuously.
-  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
-  // Using std::string and let it manage memory.
-  // 4096 is minimum size requried by auto_detect_format
-  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L40-L48
-  const size_t in_buffer_size = 4096;
-  std::string in_buffer(in_buffer_size, 'x');
-  auto* in_buf = const_cast<char*>(in_buffer.data());
-  // Fetch the header, and copy it to the buffer.
-  auto header = static_cast<std::string>(
-      static_cast<py::bytes>(fileobj.attr("read")(4096)));
-  memcpy(
-      static_cast<void*>(in_buf),
-      static_cast<void*>(const_cast<char*>(header.data())),
-      header.length());
  // Open file (this starts reading the header)
+  // When opening a file there are two functions that can touches FILE*.
+  // * `auto_detect_format`
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
+  // * `startread` handler of detected format.
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
+  // To see the handler of a particular format, go to
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
+  // For example, voribs can be found
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
  SoxFormat sf(sox_open_mem_read(
      in_buf,
      in_buffer_size,

--- a/torchaudio/csrc/sox/effects_chain.cpp
+++ b/torchaudio/csrc/sox/effects_chain.cpp
@@ -296,6 +296,7 @@ namespace {
 struct FileObjInputPriv {
  sox_format_t* sf;
  py::object* fileobj;
+  bool eof_reached;
  char* buffer;
  uint64_t buffer_size;
 };
@@ -312,9 +313,7 @@ struct FileObjOutputPriv {
 int fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
  auto sf = priv->sf;
-  auto fileobj = priv->fileobj;
  auto buffer = priv->buffer;
-  auto buffer_size = priv->buffer_size;
  // 1. Refresh the buffer
  //
@@ -326,32 +325,30 @@ int fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  //
  // Before:
  //
-  //     |<--------consumed------->|<-remaining->|
+  //     |<-------consumed------>|<---remaining--->|
-  //     |*************************|-------------|
+  //     |***********************|-----------------|
-  //                               ^ ftell
+  //                             ^ ftell
  //
  // After:
  //
-  //     |<-offset->|<-remaining->|<--new data-->|
+  //     |<-offset->|<---remaining--->|<-new data->|
-  //     |**********|-------------|++++++++++++++|
+  //     |**********|-----------------|++++++++++++|
  //                ^ ftell
  const auto num_consumed = sf->tell_off;
-  const auto num_remain = buffer_size - num_consumed;
+  const auto num_remain = priv->buffer_size - num_consumed;
-  // 1.1. First, we fetch the data to see if there is data to fill the buffer
+  // 1.1. Fetch the data to see if there is data to fill the buffer
-  py::bytes chunk_ = fileobj->attr("read")(num_consumed);
+  uint64_t num_refill = 0;
-  const auto num_refill = py::len(chunk_);
+  std::string chunk(num_consumed, '\0');
-  const auto offset = buffer_size - (num_remain + num_refill);
+  if (num_consumed && !priv->eof_reached) {
+    num_refill = read_fileobj(
-  if (num_refill > num_consumed) {
+        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
-    std::ostringstream message;
+    if (num_refill < num_consumed) {
-    message
+      priv->eof_reached = true;
-        << "Tried to read up to " << num_consumed << " bytes but, "
+    }
-        << "recieved " << num_refill << " bytes. "
-        << "The given object does not confirm to read protocol of file object.";
-    throw std::runtime_error(message.str());
  }
+  const auto offset = num_consumed - num_refill;
  // 1.2. Move the unconsumed data towards the beginning of buffer.
  if (num_remain) {
@@ -362,7 +359,6 @@ int fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // 1.3. Refill the remaining buffer.
  if (num_refill) {
-    auto chunk = static_cast<std::string>(chunk_);
    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
    auto dst = buffer + offset + num_remain;
    memcpy(dst, src, num_refill);
@@ -383,7 +379,9 @@ int fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // store the actual number read back to *osamp
  *osamp = sox_read(sf, obuf, *osamp);
-  return *osamp ? SOX_SUCCESS : SOX_EOF;
+  // Decoding is finished when fileobject is exhausted and sox can no longer
+  // decode a sample.
+  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
 }
 int fileobj_output_flow(
@@ -469,6 +467,7 @@ void SoxEffectsChain::addInputFileObj(
  auto priv = static_cast<FileObjInputPriv*>(e->priv);
  priv->sf = sf;
  priv->fileobj = fileobj;
+  priv->eof_reached = false;
  priv->buffer = buffer;
  priv->buffer_size = buffer_size;
  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {

--- a/torchaudio/csrc/sox/utils.cpp
+++ b/torchaudio/csrc/sox/utils.cpp
@@ -337,9 +337,6 @@ uint64_t read_fileobj(py::object* fileobj, const uint64_t size, char* buffer) {
          << "The given object does not confirm to read protocol of file object.";
      throw std::runtime_error(message.str());
    }
-    std::cerr << "req: " << request << ", fetched: " << chunk_len << std::endl;
-    std::cerr << "buffer: " << (void*)buffer << std::endl;
    memcpy(buffer, chunk.data(), chunk_len);
    buffer += chunk_len;
    num_read += chunk_len;