Make buffer size configurable in ffmpeg file object operations and set size in backend (#2810)

Summary: Partly addresses https://github.com/pytorch/audio/issues/2686 and https://github.com/pytorch/audio/issues/2356. Currently, when the buffer used for file object decoding is insufficiently large, `torchaudio.load` returns a shorter waveform than expected. To deal with this, the user is expected to increase the buffer size via `torchaudio.utils.sox_utils.set_buffer_size`, but this does not influence the buffer used by the FFmpeg fallback. To fix this, this PR introduces changes that apply the buffer size set for the SoX backend (as read back with `torchaudio.utils.sox_utils.get_buffer_size`) to FFmpeg. As a follow-up, we should see whether it's possible to programmatically detect that the buffer is too small and flag it to the user. Pull Request resolved: https://github.com/pytorch/audio/pull/2810 Reviewed By: mthrok Differential Revision: D40906978 Pulled By: hwangjeff fbshipit-source-id: 256fe1da8b21610b05bea9a0e043f484f9ea2e76

Make buffer size configurable in ffmpeg file object operations and set size in backend (#2810)
Summary: Partly addresses https://github.com/pytorch/audio/issues/2686 and https://github.com/pytorch/audio/issues/2356. Currently, when the buffer used for file object decoding is insufficiently large, `torchaudio.load` returns a shorter waveform than expected. To deal with this, the user is expected to increase the buffer size via `torchaudio.utils.sox_utils.set_buffer_size`, but this does not influence the buffer used by the FFmpeg fallback. To fix this, this PR introduces changes that apply the buffer size set for the SoX backend (as read back with `torchaudio.utils.sox_utils.get_buffer_size`) to FFmpeg. As a follow-up, we should see whether it's possible to programmatically detect that the buffer is too small and flag it to the user. Pull Request resolved: https://github.com/pytorch/audio/pull/2810 Reviewed By: mthrok Differential Revision: D40906978 Pulled By: hwangjeff fbshipit-source-id: 256fe1da8b21610b05bea9a0e043f484f9ea2e76
87eca36d · hwangjeff · Facebook GitHub Bot · 6318c81f · 87eca36d · 87eca36d
Commit 87eca36d authored Nov 01, 2022 by hwangjeff Committed by Facebook GitHub Bot Nov 01, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 43 additions and 8 deletions

torchaudio/backend/sox_io_backend.py torchaudio/backend/sox_io_backend.py +39 -6

torchaudio/io/_compat.py torchaudio/io/_compat.py +4 -2

No files found.
--- a/torchaudio/backend/sox_io_backend.py
+++ b/torchaudio/backend/sox_io_backend.py
@@ -4,6 +4,7 @@ from typing import Optional, Tuple
 import torch
 import torchaudio
 from torchaudio._internal import module_utils as _mod_utils
+from torchaudio.utils.sox_utils import get_buffer_size

 from .common import AudioMetaData

@@ -91,12 +92,13 @@ def info(
            # The previous libsox-based implementation required `format="mp3"`
            # because internally libsox does not auto-detect the format.
            # For the special BC for mp3, we handle mp3 differently.
+            buffer_size = get_buffer_size()
            if format == "mp3":
-                return _fallback_info_fileobj(filepath, format)
+                return _fallback_info_fileobj(filepath, format, buffer_size)
            sinfo = torchaudio._torchaudio.get_info_fileobj(filepath, format)
            if sinfo is not None:
                return AudioMetaData(*sinfo)
-            return _fallback_info_fileobj(filepath, format)
+            return _fallback_info_fileobj(filepath, format, buffer_size)
        filepath = os.fspath(filepath)
    sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
    if sinfo is not None:
@@ -210,14 +212,31 @@ def load(
            # The previous libsox-based implementation required `format="mp3"`
            # because internally libsox does not auto-detect the format.
            # For the special BC for mp3, we handle mp3 differently.
+            buffer_size = get_buffer_size()
            if format == "mp3":
-                return _fallback_load_fileobj(filepath, frame_offset, num_frames, normalize, channels_first, format)
+                return _fallback_load_fileobj(
+                    filepath,
+                    frame_offset,
+                    num_frames,
+                    normalize,
+                    channels_first,
+                    format,
+                    buffer_size,
+                )
            ret = torchaudio._torchaudio.load_audio_fileobj(
                filepath, frame_offset, num_frames, normalize, channels_first, format
            )
            if ret is not None:
                return ret
-            return _fallback_load_fileobj(filepath, frame_offset, num_frames, normalize, channels_first, format)
+            return _fallback_load_fileobj(
+                filepath,
+                frame_offset,
+                num_frames,
+                normalize,
+                channels_first,
+                format,
+                buffer_size,
+            )
        filepath = os.fspath(filepath)
    ret = torch.ops.torchaudio.sox_io_load_audio_file(
        filepath, frame_offset, num_frames, normalize, channels_first, format
@@ -385,10 +404,24 @@ def save(
    if not torch.jit.is_scripting():
        if hasattr(filepath, "write"):
            torchaudio._torchaudio.save_audio_fileobj(
-                filepath, src, sample_rate, channels_first, compression, format, encoding, bits_per_sample
+                filepath,
+                src,
+                sample_rate,
+                channels_first,
+                compression,
+                format,
+                encoding,
+                bits_per_sample,
            )
            return
        filepath = os.fspath(filepath)
    torch.ops.torchaudio.sox_io_save_audio_file(
-        filepath, src, sample_rate, channels_first, compression, format, encoding, bits_per_sample
+        filepath,
+        src,
+        sample_rate,
+        channels_first,
+        compression,
+        format,
+        encoding,
+        bits_per_sample,
    )
--- a/torchaudio/io/_compat.py
+++ b/torchaudio/io/_compat.py
@@ -36,8 +36,9 @@ def info_audio(
 def info_audio_fileobj(
    src,
    format: Optional[str],
+    buffer_size: int = 4096,
 ) -> AudioMetaData:
-    s = torchaudio._torchaudio_ffmpeg.StreamReaderFileObj(src, format, None, 4096)
+    s = torchaudio._torchaudio_ffmpeg.StreamReaderFileObj(src, format, None, buffer_size)
    return _info_audio(s)


@@ -110,6 +111,7 @@ def load_audio_fileobj(
    convert: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
+    buffer_size: int = 4096,
 ) -> Tuple[torch.Tensor, int]:
-    s = torchaudio._torchaudio_ffmpeg.StreamReaderFileObj(src, format, None, 4096)
+    s = torchaudio._torchaudio_ffmpeg.StreamReaderFileObj(src, format, None, buffer_size)
    return _load_audio(s, frame_offset, num_frames, convert, channels_first)