Remove ffmpeg fallback from sox_io backend (#3516)

Summary: In https://github.com/pytorch/audio/issues/2419, we added ffmpeg as a fallback for the sox_io backend. This was a workaround for the issue caused by the removal of libmad. Now that we have introduced the `backend` argument to I/O functions, and the libsox integration has moved to dynamic binding, where users can use libsox with libmad integration, we no longer need the workaround. This commit is based on reverting https://github.com/pytorch/audio/issues/2416 (fd7ace17). Pull Request resolved: https://github.com/pytorch/audio/pull/3516 Reviewed By: huangruizhe Differential Revision: D47855272 Pulled By: mthrok fbshipit-source-id: 5af73af7865f6e545ccb052d478e86588ff2a014

Remove ffmpeg fallback from sox_io backend (#3516)
Summary: In https://github.com/pytorch/audio/issues/2419, we added ffmpeg as a fallback for the sox_io backend. This was a workaround for the issue caused by the removal of libmad. Now that we have introduced the `backend` argument to I/O functions, and the libsox integration has moved to dynamic binding, where users can use libsox with libmad integration, we no longer need the workaround. This commit is based on reverting https://github.com/pytorch/audio/issues/2416 (fd7ace17). Pull Request resolved: https://github.com/pytorch/audio/pull/3516 Reviewed By: huangruizhe Differential Revision: D47855272 Pulled By: mthrok fbshipit-source-id: 5af73af7865f6e545ccb052d478e86588ff2a014
2c8665de · moto · Facebook GitHub Bot · a051985f · 2c8665de · 2c8665de
Commit 2c8665de authored Jul 28, 2023 by moto Committed by Facebook GitHub Bot Jul 28, 2023
18 changed files
--- a/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py
+++ b/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py
@@ -318,25 +318,6 @@ class TestLoadParams(TempDirMixin, PytorchTestCase):

        self._test(torch.ops.torchaudio.sox_io_load_audio_file, frame_offset, num_frames, channels_first, normalize)

-    @nested_params(
-        [0, 1, 10, 100, 1000],
-        [-1, 1, 10, 100, 1000],
-        [True, False],
-        [True, False],
-    )
-    def test_ffmpeg(self, frame_offset, num_frames, channels_first, normalize):
-        """The combination of properly changes the output tensor"""
-        from torchaudio.io._compat import load_audio, load_audio_fileobj
-
-        self._test(load_audio, frame_offset, num_frames, channels_first, normalize)
-
-        # test file-like obj
-        def func(path, *args):
-            with open(path, "rb") as fileobj:
-                return load_audio_fileobj(fileobj, *args)
-
-        self._test(func, frame_offset, num_frames, channels_first, normalize)
-

 @skipIfNoSox
 @skipIfNoExec("sox")

--- a/test/torchaudio_unittest/backend/sox_io/info_test.py
+++ b/test/torchaudio_unittest/backend/sox_io/info_test.py
@@ -277,6 +277,7 @@ class TestInfo(TempDirMixin, PytorchTestCase):


 @skipIfNoSox
+@skipIfNoSoxDecoder("opus")
 class TestInfoOpus(PytorchTestCase):
    @parameterized.expand(
        list(
@@ -304,17 +305,15 @@ class TestLoadWithoutExtension(PytorchTestCase):
    def test_mp3(self):
        """MP3 file without extension can be loaded

-        Originally, we added `format` argument for this case, but now we use FFmpeg
-        for MP3 decoding, which works even without `format` argument.
        https://github.com/pytorch/audio/issues/1040

        The file was generated with the following command
            ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
        """
        path = get_asset_path("mp3_without_ext")
-        sinfo = sox_io_backend.info(path)
+        sinfo = sox_io_backend.info(path, format="mp3")
        assert sinfo.sample_rate == 16000
-        assert sinfo.num_frames == 80000
+        assert sinfo.num_frames == 81216
        assert sinfo.num_channels == 1
        assert sinfo.bits_per_sample == 0  # bit_per_sample is irrelevant for compressed formats
        assert sinfo.encoding == "MP3"

--- a/test/torchaudio_unittest/backend/sox_io/load_test.py
+++ b/test/torchaudio_unittest/backend/sox_io/load_test.py
@@ -315,40 +315,19 @@ class TestLoadParams(TempDirMixin, PytorchTestCase):

        self._test(torch.ops.torchaudio.sox_io_load_audio_file, frame_offset, num_frames, channels_first, normalize)

-    @nested_params(
-        [0, 1, 10, 100, 1000],
-        [-1, 1, 10, 100, 1000],
-        [True, False],
-        [True, False],
-    )
-    def test_ffmpeg(self, frame_offset, num_frames, channels_first, normalize):
-        """The combination of properly changes the output tensor"""
-        from torchaudio.io._compat import load_audio, load_audio_fileobj
-
-        self._test(load_audio, frame_offset, num_frames, channels_first, normalize)
-
-        # test file-like obj
-        def func(path, *args):
-            with open(path, "rb") as fileobj:
-                return load_audio_fileobj(fileobj, *args)
-
-        self._test(func, frame_offset, num_frames, channels_first, normalize)
-

 @skipIfNoSox
 class TestLoadWithoutExtension(PytorchTestCase):
    def test_mp3(self):
        """MP3 file without extension can be loaded

-        Originally, we added `format` argument for this case, but now we use FFmpeg
-        for MP3 decoding, which works even without `format` argument.
        https://github.com/pytorch/audio/issues/1040

        The file was generated with the following command
            ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
        """
        path = get_asset_path("mp3_without_ext")
-        _, sr = sox_io_backend.load(path)
+        _, sr = sox_io_backend.load(path, format="mp3")
        assert sr == 16000



--- a/torchaudio/backend/sox_io_backend.py
+++ b/torchaudio/backend/sox_io_backend.py
@@ -7,33 +7,6 @@ import torchaudio
 from .common import AudioMetaData


-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
-
-
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
-    raise RuntimeError("Failed to load audio from {}".format(filepath))
-
-
-if torchaudio._extension._FFMPEG_EXT is not None:
-    import torchaudio.io._compat as _compat
-
-    _fallback_info = _compat.info_audio
-    _fallback_load = _compat.load_audio
-else:
-    _fallback_info = _fail_info
-    _fallback_load = _fail_load
-
-
 @torchaudio._extension.fail_if_no_sox
 def info(
    filepath: str,
@@ -58,9 +31,7 @@ def info(
            raise RuntimeError("sox_io backend does not support file-like object.")
        filepath = os.fspath(filepath)
    sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
-    if sinfo is not None:
    return AudioMetaData(*sinfo)
-    return _fallback_info(filepath, format)


 @torchaudio._extension.fail_if_no_sox
@@ -153,12 +124,9 @@ def load(
        if hasattr(filepath, "read"):
            raise RuntimeError("sox_io backend does not support file-like object.")
        filepath = os.fspath(filepath)
-    ret = torch.ops.torchaudio.sox_io_load_audio_file(
+    return torch.ops.torchaudio.sox_io_load_audio_file(
        filepath, frame_offset, num_frames, normalize, channels_first, format
    )
-    if ret is not None:
-        return ret
-    return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)


 @torchaudio._extension.fail_if_no_sox

--- a/torchaudio/csrc/sox/effects.cpp
+++ b/torchaudio/csrc/sox/effects.cpp
@@ -89,7 +89,7 @@ auto apply_effects_file(
    c10::optional<bool> normalize,
    c10::optional<bool> channels_first,
    const c10::optional<std::string>& format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
+    -> std::tuple<torch::Tensor, int64_t> {
  // Open input file
  SoxFormat sf(sox_open_read(
      path.c_str(),
@@ -97,10 +97,7 @@ auto apply_effects_file(
      /*encoding=*/nullptr,
      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));

-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
+  validate_input_file(sf, path);

  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);


--- a/torchaudio/csrc/sox/effects.h
+++ b/torchaudio/csrc/sox/effects.h
@@ -22,7 +22,7 @@ auto apply_effects_file(
    c10::optional<bool> normalize,
    c10::optional<bool> channels_first,
    const c10::optional<std::string>& format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
+    -> std::tuple<torch::Tensor, int64_t>;

 } // namespace torchaudio::sox


--- a/torchaudio/csrc/sox/io.cpp
+++ b/torchaudio/csrc/sox/io.cpp
@@ -8,7 +8,7 @@ using namespace torch::indexing;

 namespace torchaudio::sox {

-c10::optional<MetaDataTuple> get_info_file(
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
    const std::string& path,
    const c10::optional<std::string>& format) {
  SoxFormat sf(sox_open_read(
@@ -17,12 +17,9 @@ c10::optional<MetaDataTuple> get_info_file(
      /*encoding=*/nullptr,
      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));

-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
+  validate_input_file(sf, path);

-  return std::forward_as_tuple(
+  return std::make_tuple(
      static_cast<int64_t>(sf->signal.rate),
      static_cast<int64_t>(sf->signal.length / sf->signal.channels),
      static_cast<int64_t>(sf->signal.channels),
@@ -58,7 +55,7 @@ std::vector<std::vector<std::string>> get_effects(
  return effects;
 }

-c10::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+std::tuple<torch::Tensor, int64_t> load_audio_file(
    const std::string& path,
    const c10::optional<int64_t>& frame_offset,
    const c10::optional<int64_t>& num_frames,

--- a/torchaudio/csrc/sox/io.h
+++ b/torchaudio/csrc/sox/io.h
@@ -11,14 +11,11 @@ auto get_effects(
    const c10::optional<int64_t>& num_frames)
    -> std::vector<std::vector<std::string>>;

-using MetaDataTuple =
-    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-c10::optional<MetaDataTuple> get_info_file(
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
    const std::string& path,
    const c10::optional<std::string>& format);

-c10::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+std::tuple<torch::Tensor, int64_t> load_audio_file(
    const std::string& path,
    const c10::optional<int64_t>& frame_offset,
    const c10::optional<int64_t>& num_frames,

--- a/torchaudio/csrc/sox/pybind/effects.cpp
+++ b/torchaudio/csrc/sox/pybind/effects.cpp
-#include <torchaudio/csrc/sox/pybind/effects.h>
-#include <torchaudio/csrc/sox/pybind/effects_chain.h>
-#include <torchaudio/csrc/sox/pybind/utils.h>
-
-namespace torchaudio::sox {
-
-// Streaming decoding over file-like object is tricky because libsox operates on
-// FILE pointer. The folloing is what `sox` and `play` commands do
-//  - file input -> FILE pointer
-//  - URL input -> call wget in suprocess and pipe the data -> FILE pointer
-//  - stdin -> FILE pointer
-//
-// We want to, instead, fetch byte strings chunk by chunk, consume them, and
-// discard.
-//
-// Here is the approach
-// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
-// chunk of byte string
-//    This will perform header-based format detection, if necessary, then fill
-//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
-//    which returns FILE* which points the buffer of the provided byte string.
-// 2. Each time sox reads a chunk from the FILE*, we update the underlying
-// buffer in a way that it
-//    starts with unseen data, and append the new data read from the given
-//    fileobj. This will trick libsox as if it keeps reading from the FILE*
-//    continuously.
-// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
-auto apply_effects_fileobj(
-    py::object fileobj,
-    const std::vector<std::vector<std::string>>& effects,
-    c10::optional<bool> normalize,
-    c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
-  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
-  //
-  // For certain format (such as FLAC), libsox keeps reading the content at
-  // the initialization unless it reaches EOF even when the header is properly
-  // parsed. (Making buffer size 8192, which is way bigger than the header,
-  // resulted in libsox consuming all the buffer content at the time it opens
-  // the file.) Therefore buffer has to always contain valid data, except after
-  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
-  // first check if there is enough data to fill the buffer. `read_fileobj`
-  // repeatedly calls `read`  method until it receives the requested length of
-  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
-  // the whole audio data are fetched.
-  //
-  // * This can be changed with `torchaudio.utils.sox_utils.set_buffer_size`.
-  const auto capacity = [&]() {
-    // NOTE:
-    // Use the abstraction provided by `libtorchaudio` to access the global
-    // config defined by libsox. Directly using `sox_get_globals` function will
-    // end up retrieving the static variable defined in `_torchaudio`, which is
-    // not correct.
-    const auto bufsiz = get_buffer_size();
-    const int64_t kDefaultCapacityInBytes = 256;
-    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
-                                              : kDefaultCapacityInBytes;
-  }();
-  std::string buffer(capacity, '\0');
-  auto* in_buf = const_cast<char*>(buffer.data());
-  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
-  // If the file is shorter than 256, then libsox cannot read the header.
-  auto in_buffer_size = (num_read > 256) ? num_read : 256;
-
-  // Open file (this starts reading the header)
-  // When opening a file there are two functions that can touches FILE*.
-  // * `auto_detect_format`
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
-  // * `startread` handler of detected format.
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
-  // To see the handler of a particular format, go to
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
-  // For example, voribs can be found
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
-  SoxFormat sf(sox_open_mem_read(
-      in_buf,
-      in_buffer_size,
-      /*signal=*/nullptr,
-      /*encoding=*/nullptr,
-      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-  // In case of streamed data, length can be 0
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
-
-  // Prepare output buffer
-  std::vector<sox_sample_t> out_buffer;
-  out_buffer.reserve(sf->signal.length);
-
-  // Create and run SoxEffectsChain
-  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
-  SoxEffectsChainPyBind chain(
-      /*input_encoding=*/sf->encoding,
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
-  for (const auto& effect : effects) {
-    chain.addEffect(effect);
-  }
-  chain.addOutputBuffer(&out_buffer);
-  chain.run();
-
-  // Create tensor from buffer
-  bool channels_first_ = channels_first.value_or(true);
-  auto tensor = convert_to_tensor(
-      /*buffer=*/out_buffer.data(),
-      /*num_samples=*/out_buffer.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      normalize.value_or(true),
-      channels_first_);
-
-  return std::forward_as_tuple(
-      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
-}
-
-} // namespace torchaudio::sox
--- a/torchaudio/csrc/sox/pybind/effects.h
+++ b/torchaudio/csrc/sox/pybind/effects.h
-#ifndef TORCHAUDIO_PYBIND_SOX_EFFECTS_H
-#define TORCHAUDIO_PYBIND_SOX_EFFECTS_H
-
-#include <torch/extension.h>
-
-namespace torchaudio::sox {
-
-auto apply_effects_fileobj(
-    py::object fileobj,
-    const std::vector<std::vector<std::string>>& effects,
-    c10::optional<bool> normalize,
-    c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
-
-} // namespace torchaudio::sox
-
-#endif
--- a/torchaudio/csrc/sox/pybind/effects_chain.cpp
+++ b/torchaudio/csrc/sox/pybind/effects_chain.cpp
-#include <sox.h>
-#include <torchaudio/csrc/sox/pybind/effects_chain.h>
-#include <torchaudio/csrc/sox/pybind/utils.h>
-
-namespace torchaudio::sox {
-namespace {
-
-/// helper classes for passing file-like object to SoxEffectChain
-struct FileObjInputPriv {
-  sox_format_t* sf;
-  py::object* fileobj;
-  bool eof_reached;
-  char* buffer;
-  uint64_t buffer_size;
-};
-
-struct FileObjOutputPriv {
-  sox_format_t* sf;
-  py::object* fileobj;
-  char** buffer;
-  size_t* buffer_size;
-};
-
-/// Callback function to feed byte string
-/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
-auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
-    -> int {
-  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
-  auto sf = priv->sf;
-  auto buffer = priv->buffer;
-
-  // 1. Refresh the buffer
-  //
-  // NOTE:
-  //   Since the underlying FILE* was opened with `fmemopen`, the only way
-  //   libsox detect EOF is reaching the end of the buffer. (null byte won't
-  //   help) Therefore we need to align the content at the end of buffer,
-  //   otherwise, libsox will keep reading the content beyond intended length.
-  //
-  // Before:
-  //
-  //     |<-------consumed------>|<---remaining--->|
-  //     |***********************|-----------------|
-  //                             ^ ftell
-  //
-  // After:
-  //
-  //     |<-offset->|<---remaining--->|<-new data->|
-  //     |**********|-----------------|++++++++++++|
-  //                ^ ftell
-
-  // NOTE:
-  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
-  //   supposed to be in sync, but there are cases (Vorbis) they are not
-  //   in sync and `tell_off` has seemingly uninitialized value, which
-  //   leads num_remain to be negative and cause segmentation fault
-  //   in `memmove`.
-  const auto tell = ftell((FILE*)sf->fp);
-  if (tell < 0) {
-    throw std::runtime_error("Internal Error: ftell failed.");
-  }
-  const auto num_consumed = static_cast<size_t>(tell);
-  if (num_consumed > priv->buffer_size) {
-    throw std::runtime_error("Internal Error: buffer overrun.");
-  }
-
-  const auto num_remain = priv->buffer_size - num_consumed;
-
-  // 1.1. Fetch the data to see if there is data to fill the buffer
-  size_t num_refill = 0;
-  std::string chunk(num_consumed, '\0');
-  if (num_consumed && !priv->eof_reached) {
-    num_refill = read_fileobj(
-        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
-    if (num_refill < num_consumed) {
-      priv->eof_reached = true;
-    }
-  }
-  const auto offset = num_consumed - num_refill;
-
-  // 1.2. Move the unconsumed data towards the beginning of buffer.
-  if (num_remain) {
-    auto src = static_cast<void*>(buffer + num_consumed);
-    auto dst = static_cast<void*>(buffer + offset);
-    memmove(dst, src, num_remain);
-  }
-
-  // 1.3. Refill the remaining buffer.
-  if (num_refill) {
-    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
-    auto dst = buffer + offset + num_remain;
-    memcpy(dst, src, num_refill);
-  }
-
-  // 1.4. Set the file pointer to the new offset
-  sf->tell_off = offset;
-  fseek((FILE*)sf->fp, offset, SEEK_SET);
-
-  // 2. Perform decoding operation
-  // The following part is practically same as "input" effect
-  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
-
-  // At this point, osamp represents the buffer size in bytes,
-  // but sox_read expects the maximum number of samples ready to read.
-  // Normally, this is fine, but in case when the samples are not 4-byte
-  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
-  // https://github.com/pytorch/audio/issues/2083
-  if (sf->encoding.bits_per_sample > 0)
-    *osamp /= (sf->encoding.bits_per_sample / 8);
-
-  // Ensure that it's a multiple of the number of channels
-  *osamp -= *osamp % effp->out_signal.channels;
-
-  // Read up to *osamp samples into obuf;
-  // store the actual number read back to *osamp
-  *osamp = sox_read(sf, obuf, *osamp);
-
-  // Decoding is finished when fileobject is exhausted and sox can no longer
-  // decode a sample.
-  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
-}
-
-auto fileobj_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) -> int {
-  *osamp = 0;
-  if (*isamp) {
-    auto priv = static_cast<FileObjOutputPriv*>(effp->priv);
-    auto sf = priv->sf;
-    auto fp = static_cast<FILE*>(sf->fp);
-    auto fileobj = priv->fileobj;
-    auto buffer = priv->buffer;
-
-    // Encode chunk
-    auto num_samples_written = sox_write(sf, ibuf, *isamp);
-    fflush(fp);
-
-    // Copy the encoded chunk to python object.
-    fileobj->attr("write")(py::bytes(*buffer, ftell(fp)));
-
-    // Reset FILE*
-    sf->tell_off = 0;
-    fseek(fp, 0, SEEK_SET);
-
-    if (num_samples_written != *isamp) {
-      if (sf->sox_errno) {
-        std::ostringstream stream;
-        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
-               << sf->filename;
-        throw std::runtime_error(stream.str());
-      }
-      return SOX_EOF;
-    }
-  }
-  return SOX_SUCCESS;
-}
-
-auto get_fileobj_input_handler() -> sox_effect_handler_t* {
-  static sox_effect_handler_t handler{
-      /*name=*/"input_fileobj_object",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/nullptr,
-      /*drain=*/fileobj_input_drain,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(FileObjInputPriv)};
-  return &handler;
-}
-
-auto get_fileobj_output_handler() -> sox_effect_handler_t* {
-  static sox_effect_handler_t handler{
-      /*name=*/"output_fileobj_object",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/fileobj_output_flow,
-      /*drain=*/nullptr,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(FileObjOutputPriv)};
-  return &handler;
-}
-
-} // namespace
-
-void SoxEffectsChainPyBind::addInputFileObj(
-    sox_format_t* sf,
-    char* buffer,
-    uint64_t buffer_size,
-    py::object* fileobj) {
-  in_sig_ = sf->signal;
-  interm_sig_ = in_sig_;
-
-  SoxEffect e(sox_create_effect(get_fileobj_input_handler()));
-  auto priv = static_cast<FileObjInputPriv*>(e->priv);
-  priv->sf = sf;
-  priv->fileobj = fileobj;
-  priv->eof_reached = false;
-  priv->buffer = buffer;
-  priv->buffer_size = buffer_size;
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: input fileobj");
-  }
-}
-
-void SoxEffectsChainPyBind::addOutputFileObj(
-    sox_format_t* sf,
-    char** buffer,
-    size_t* buffer_size,
-    py::object* fileobj) {
-  out_sig_ = sf->signal;
-  SoxEffect e(sox_create_effect(get_fileobj_output_handler()));
-  auto priv = static_cast<FileObjOutputPriv*>(e->priv);
-  priv->sf = sf;
-  priv->fileobj = fileobj;
-  priv->buffer = buffer;
-  priv->buffer_size = buffer_size;
-  if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: output fileobj");
-  }
-}
-
-} // namespace torchaudio::sox
--- a/torchaudio/csrc/sox/pybind/effects_chain.h
+++ b/torchaudio/csrc/sox/pybind/effects_chain.h
-#ifndef TORCHAUDIO_PYBIND_SOX_EFFECTS_CHAIN_H
-#define TORCHAUDIO_PYBIND_SOX_EFFECTS_CHAIN_H
-
-#include <torch/extension.h>
-#include <torchaudio/csrc/sox/effects_chain.h>
-
-namespace torchaudio::sox {
-
-class SoxEffectsChainPyBind : public SoxEffectsChain {
-  using SoxEffectsChain::SoxEffectsChain;
-
- public:
-  void addInputFileObj(
-      sox_format_t* sf,
-      char* buffer,
-      uint64_t buffer_size,
-      py::object* fileobj);
-
-  void addOutputFileObj(
-      sox_format_t* sf,
-      char** buffer,
-      size_t* buffer_size,
-      py::object* fileobj);
-};
-
-} // namespace torchaudio::sox
-
-#endif
--- a/torchaudio/csrc/sox/pybind/io.cpp
+++ b/torchaudio/csrc/sox/pybind/io.cpp
-#include <torchaudio/csrc/sox/io.h>
-#include <torchaudio/csrc/sox/pybind/effects.h>
-#include <torchaudio/csrc/sox/pybind/effects_chain.h>
-#include <torchaudio/csrc/sox/pybind/io.h>
-#include <torchaudio/csrc/sox/pybind/utils.h>
-#include <torchaudio/csrc/sox/types.h>
-
-#include <utility>
-
-namespace torchaudio::sox {
-
-auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
-    -> c10::optional<MetaDataTuple> {
-  // Prepare in-memory file object
-  // When libsox opens a file, it also reads the header.
-  // When opening a file there are two functions that might touch FILE* (and the
-  // underlying buffer).
-  // * `auto_detect_format`
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
-  // * `startread` handler of detected format.
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
-  // To see the handler of a particular format, go to
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
-  // For example, voribs can be found
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
-  //
-  // `auto_detect_format` function only requires 256 bytes, but format-dependent
-  // `startread` handler might require more data. In case of vorbis, the size of
-  // header is unbounded, but typically 4kB maximum.
-  //
-  // "The header size is unbounded, although for streaming a rule-of-thumb of
-  // 4kB or less is recommended (and Xiph.Org's Vorbis encoder follows this
-  // suggestion)."
-  //
-  // See:
-  // https://xiph.org/vorbis/doc/Vorbis_I_spec.html
-  const auto capacity = [&]() {
-    // NOTE:
-    // Use the abstraction provided by `libtorchaudio` to access the global
-    // config defined by libsox. Directly using `sox_get_globals` function will
-    // end up retrieving the static variable defined in `_torchaudio`, which is
-    // not correct.
-    const auto bufsiz = get_buffer_size();
-    const int64_t kDefaultCapacityInBytes = 4096;
-    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
-                                              : kDefaultCapacityInBytes;
-  }();
-  std::string buffer(capacity, '\0');
-  auto* buf = const_cast<char*>(buffer.data());
-  auto num_read = read_fileobj(&fileobj, capacity, buf);
-  // If the file is shorter than 256, then libsox cannot read the header.
-  auto buf_size = (num_read > 256) ? num_read : 256;
-
-  SoxFormat sf(sox_open_mem_read(
-      buf,
-      buf_size,
-      /*signal=*/nullptr,
-      /*encoding=*/nullptr,
-      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return c10::optional<MetaDataTuple>{};
-  }
-
-  return std::forward_as_tuple(
-      static_cast<int64_t>(sf->signal.rate),
-      static_cast<int64_t>(sf->signal.length / sf->signal.channels),
-      static_cast<int64_t>(sf->signal.channels),
-      static_cast<int64_t>(sf->encoding.bits_per_sample),
-      get_encoding(sf->encoding.encoding));
-}
-
-auto load_audio_fileobj(
-    py::object fileobj,
-    c10::optional<int64_t> frame_offset,
-    c10::optional<int64_t> num_frames,
-    c10::optional<bool> normalize,
-    c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
-  auto effects = get_effects(frame_offset, num_frames);
-  return apply_effects_fileobj(
-      std::move(fileobj),
-      effects,
-      normalize,
-      channels_first,
-      std::move(format));
-}
-
-namespace {
-
-// helper class to automatically release buffer, to be used by
-// save_audio_fileobj
-struct AutoReleaseBuffer {
-  char* ptr;
-  size_t size;
-
-  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
-  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
-  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
-  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
-  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
-  ~AutoReleaseBuffer() {
-    if (ptr) {
-      free(ptr);
-    }
-  }
-};
-
-} // namespace
-
-void save_audio_fileobj(
-    py::object fileobj,
-    torch::Tensor tensor,
-    int64_t sample_rate,
-    bool channels_first,
-    c10::optional<double> compression,
-    c10::optional<std::string> format,
-    c10::optional<std::string> encoding,
-    c10::optional<int64_t> bits_per_sample) {
-  validate_input_tensor(tensor);
-
-  if (!format.has_value()) {
-    throw std::runtime_error(
-        "`format` is required when saving to file object.");
-  }
-  const auto filetype = format.value();
-
-  if (filetype == "amr-nb") {
-    const auto num_channels = tensor.size(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "amr-nb format only supports single channel audio.");
-    }
-  } else if (filetype == "htk") {
-    const auto num_channels = tensor.size(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "htk format only supports single channel audio.");
-    }
-  } else if (filetype == "gsm") {
-    const auto num_channels = tensor.size(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "gsm format only supports single channel audio.");
-    }
-    if (sample_rate != 8000) {
-      throw std::runtime_error(
-          "gsm format only supports a sampling rate of 8kHz.");
-    }
-  }
-  const auto signal_info =
-      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-  const auto encoding_info = get_encodinginfo_for_save(
-      filetype,
-      tensor.dtype(),
-      compression,
-      std::move(encoding),
-      bits_per_sample);
-
-  AutoReleaseBuffer buffer;
-
-  SoxFormat sf(sox_open_memstream_write(
-      &buffer.ptr,
-      &buffer.size,
-      &signal_info,
-      &encoding_info,
-      filetype.c_str(),
-      /*oob=*/nullptr));
-
-  if (static_cast<sox_format_t*>(sf) == nullptr) {
-    throw std::runtime_error(
-        "Error saving audio file: failed to open memory stream.");
-  }
-
-  SoxEffectsChainPyBind chain(
-      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
-      /*output_encoding=*/sf->encoding);
-  chain.addInputTensor(&tensor, sample_rate, channels_first);
-  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
-  chain.run();
-
-  // Closing the sox_format_t is necessary for flushing the last chunk to the
-  // buffer
-  sf.close();
-
-  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
-}
-
-} // namespace torchaudio::sox
--- a/torchaudio/csrc/sox/pybind/io.h
+++ b/torchaudio/csrc/sox/pybind/io.h
-#ifndef TORCHAUDIO_PYBIND_SOX_IO_H
-#define TORCHAUDIO_PYBIND_SOX_IO_H
-
-#include <torch/extension.h>
-
-namespace torchaudio::sox {
-
-using MetaDataTuple =
-    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
-    -> c10::optional<MetaDataTuple>;
-
-auto load_audio_fileobj(
-    py::object fileobj,
-    c10::optional<int64_t> frame_offset,
-    c10::optional<int64_t> num_frames,
-    c10::optional<bool> normalize,
-    c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
-
-void save_audio_fileobj(
-    py::object fileobj,
-    torch::Tensor tensor,
-    int64_t sample_rate,
-    bool channels_first,
-    c10::optional<double> compression,
-    c10::optional<std::string> format,
-    c10::optional<std::string> encoding,
-    c10::optional<int64_t> bits_per_sample);
-
-} // namespace torchaudio::sox
-
-#endif
--- a/torchaudio/csrc/sox/pybind/utils.cpp
+++ b/torchaudio/csrc/sox/pybind/utils.cpp
-#include <torchaudio/csrc/sox/pybind/utils.h>
-
-namespace torchaudio::sox {
-
-auto read_fileobj(py::object* fileobj, const uint64_t size, char* buffer)
-    -> uint64_t {
-  uint64_t num_read = 0;
-  while (num_read < size) {
-    auto request = size - num_read;
-    auto chunk = static_cast<std::string>(
-        static_cast<py::bytes>(fileobj->attr("read")(request)));
-    auto chunk_len = chunk.length();
-    if (chunk_len == 0) {
-      break;
-    }
-    if (chunk_len > request) {
-      std::ostringstream message;
-      message
-          << "Requested up to " << request << " bytes but, "
-          << "received " << chunk_len << " bytes. "
-          << "The given object does not confirm to read protocol of file object.";
-      throw std::runtime_error(message.str());
-    }
-    memcpy(buffer, chunk.data(), chunk_len);
-    buffer += chunk_len;
-    num_read += chunk_len;
-  }
-  return num_read;
-}
-
-} // namespace torchaudio::sox
--- a/torchaudio/csrc/sox/pybind/utils.h
+++ b/torchaudio/csrc/sox/pybind/utils.h
-#ifndef TORCHAUDIO_PYBIND_SOX_UTILS_H
-#define TORCHAUDIO_PYBIND_SOX_UTILS_H
-
-#include <torch/extension.h>
-
-namespace torchaudio::sox {
-
-auto read_fileobj(py::object* fileobj, uint64_t size, char* buffer) -> uint64_t;
-
-} // namespace torchaudio::sox
-
-#endif
--- a/torchaudio/csrc/sox/utils.h
+++ b/torchaudio/csrc/sox/utils.h
@@ -51,6 +51,10 @@ struct SoxFormat {
  sox_format_t* fd_;
 };

+///
+/// Verify that input file is found, has known encoding, and not empty
+void validate_input_file(const SoxFormat& sf, const std::string& path);
+
 ///
 /// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32
 void validate_input_tensor(const torch::Tensor&);

--- a/torchaudio/sox_effects/sox_effects.py
+++ b/torchaudio/sox_effects/sox_effects.py
@@ -269,7 +269,4 @@ def apply_effects_file(
                "Please use torchaudio.io.AudioEffector."
            )
        path = os.fspath(path)
-    ret = torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
-    if ret is not None:
-        return ret
-    raise RuntimeError("Failed to load audio from {}".format(path))
+    return torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)