Add AudioEffector (#3163)

Summary: This commit adds a new feature AudioEffector, which can be used to apply various effects and codecs to waveforms in Tensor. Under the hood it uses StreamWriter and StreamReader to apply filters and encode/decode. This is going to replace the deprecated `apply_codec` and `apply_sox_effect_tensor` functions. It can also perform online, chunk-by-chunk filtering. Tutorial to follow. closes https://github.com/pytorch/audio/issues/3161 Pull Request resolved: https://github.com/pytorch/audio/pull/3163 Reviewed By: hwangjeff Differential Revision: D44576660 Pulled By: mthrok fbshipit-source-id: 2c5cc87082ab431315d29d56d6ac9efaf4cf7aeb

Add AudioEffector (#3163)
Summary: This commit adds a new feature AudioEffector, which can be used to apply various effects and codecs to waveforms in Tensor. Under the hood it uses StreamWriter and StreamReader to apply filters and encode/decode. This is going to replace the deprecated `apply_codec` and `apply_sox_effect_tensor` functions. It can also perform online, chunk-by-chunk filtering. Tutorial to follow. closes https://github.com/pytorch/audio/issues/3161 Pull Request resolved: https://github.com/pytorch/audio/pull/3163 Reviewed By: hwangjeff Differential Revision: D44576660 Pulled By: mthrok fbshipit-source-id: 2c5cc87082ab431315d29d56d6ac9efaf4cf7aeb
a4036248 · moto · Facebook GitHub Bot · fda41bbf · a4036248 · a4036248
Commit a4036248 authored Mar 31, 2023 by moto Committed by Facebook GitHub Bot Mar 31, 2023
8 changed files
--- a/docs/source/_templates/autosummary/io_class.rst
+++ b/docs/source/_templates/autosummary/io_class.rst
@@ -42,7 +42,6 @@ Methods
   not item.startswith('_')
   and item not in inherited_members
   and item not in attributes
-   and item != "CodecConfig"
   %}
 {{ item | underline("~") }}
@@ -56,11 +55,12 @@ Methods
 {%- endif %}
-{%- if name == "StreamReader" %}
+{%- if name in ["StreamReader", "StreamWriter"] %}
 Support Structures
 ------------------
+{%- if name == "StreamReader" %}
 {%- for item in [
    "ChunkTensor",
    "SourceStream",
@@ -77,15 +77,14 @@ Support Structures
   :members:
 {%- endfor %}
-{%- elif name == "StreamWriter" %}
-Support Structures
+{%- elif name == "StreamWriter" %}
------------------
 CodecConfig
 ~~~~~~~~~~~
-.. autoclass:: torchaudio.io::StreamWriter.CodecConfig()
+.. autoclass:: torchaudio.io::CodecConfig
   :members:
 {%- endif %}
+{%- endif %}
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -12,6 +12,7 @@ torchaudio.io
   StreamReader
   StreamWriter
+   AudioEffector
   play_audio
 .. rubric:: Tutorials using ``torchaudio.io``

--- a/test/torchaudio_unittest/io/common.py
+++ b/test/torchaudio_unittest/io/common.py
+import torchaudio
+# If FFmpeg is 4.1 or older, tests that check the number of output samples
+# from OPUS fail. They work on 4.2+.
+# Probably this commit fixed it.
+# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
+def lt42():
+    """Return ``True`` if the linked libavcodec is older than the one shipped with FFmpeg 4.2."""
+    ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
+    # 5.1 libavcodec     59. 18.100
+    # 4.4 libavcodec     58.134.100
+    # 4.3 libavcodec     58. 91.100
+    # 4.2 libavcodec     58. 54.100
+    # 4.1 libavcodec     58. 35.100
+    # Compare (major, minor) lexicographically against 58.54 (FFmpeg 4.2).
+    # Testing the two components independently would classify a major
+    # version below 58 with a minor of 54 or more as "4.2 or newer".
+    return tuple(ver[:2]) < (58, 54)
--- a/test/torchaudio_unittest/io/effector_test.py
+++ b/test/torchaudio_unittest/io/effector_test.py
+from parameterized import parameterized
+from torchaudio.io import AudioEffector
+from torchaudio_unittest.common_utils import get_sinusoid, skipIfNoFFmpeg, TorchaudioTestCase
+from .common import lt42
+@skipIfNoFFmpeg
+class EffectorTest(TorchaudioTestCase):
+    def test_null(self):
+        """No effect and codec will return the same result"""
+        sample_rate = 8000
+        frames_per_chunk = 256
+        effector = AudioEffector(effect=None, format=None)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+        # one-go: the whole waveform is processed by a single call
+        output = effector.apply(original, sample_rate)
+        self.assertEqual(original, output)
+        # streaming: the i-th chunk must equal the i-th slice of the input
+        for i, chunk in enumerate(effector.stream(original, sample_rate, frames_per_chunk)):
+            start = i * frames_per_chunk
+            end = (i + 1) * frames_per_chunk
+            self.assertEqual(original[start:end, :], chunk)
+    @parameterized.expand(
+        [
+            ("ogg", "flac"),  # flac only supports s16 and s32
+            ("ogg", "opus"),  # opus only supports 48k Hz
+            ("ogg", "vorbis"),  # vorbis only supports stereo
+            ("wav", None),
+            ("wav", "pcm_u8"),
+            ("mp3", None),
+        ]
+    )
+    def test_formats(self, format, encoder):
+        """Formats (some with restrictions) just work without an issue in effector"""
+        sample_rate = 8000
+        effector = AudioEffector(format=format, encoder=encoder)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+        output = effector.apply(original, sample_rate)
+        # On 4.1 OPUS produces 8020 samples (extra 20)
+        # this has been fixed on 4.2+
+        if encoder == "opus" and lt42():
+            return
+        self.assertEqual(original.shape, output.shape)
+        # Note
+        # MP3 adds padding which cannot be removed when the encoded data is written to
+        # file-like object without seek method.
+        # The number of padding is retrievable as `AVCodecContext::initial_padding`
+        # https://ffmpeg.org/doxygen/4.1/structAVCodecContext.html#a8f95550ce04f236e9915516d04d3d1ab
+        # but this is not exposed yet.
+        # These "priming" samples have negative time stamp, so we can also add logic
+        # to discard them at decoding, however, as far as I checked, when data is loaded
+        # with StreamReader, the time stamp is reset. I tried options like avoid_negative_ts,
+        # https://ffmpeg.org/ffmpeg-formats.html
+        # but it made no difference. Perhaps this is because the information about negative
+        # timestamp is only available at encoding side, and it presumably is written to
+        # header file, but it is not happening somehow with file-like object.
+        # Need to investigate more to remove MP3 padding
+        if format == "mp3":
+            return
+        # Streaming with a chunk size equal to the input length: every yielded
+        # chunk is expected to have the shape of the input.
+        for chunk in effector.stream(original, sample_rate, frames_per_chunk=original.size(0)):
+            self.assertEqual(original.shape, chunk.shape)
+    @parameterized.expand([("loudnorm=I=-16:LRA=11:TP=-1.5",), ("volume=2",)])
+    def test_effect(self, effect):
+        """Applying an effect keeps the shape of the waveform"""
+        sample_rate = 8000
+        effector = AudioEffector(effect=effect)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+        output = effector.apply(original, sample_rate)
+        self.assertEqual(original.shape, output.shape)
--- a/test/torchaudio_unittest/io/stream_writer_test.py
+++ b/test/torchaudio_unittest/io/stream_writer_test.py
@@ -17,8 +17,10 @@ from torchaudio_unittest.common_utils import (
    TorchaudioTestCase,
 )
+from .common import lt42
 if is_ffmpeg_available():
-    from torchaudio.io import StreamReader, StreamWriter
+    from torchaudio.io import CodecConfig, StreamReader, StreamWriter
 def get_audio_chunk(fmt, sample_rate, num_channels):
@@ -380,19 +382,10 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
        s.process_all_packets()
        (saved,) = s.pop_chunks()
-        # This test fails for OPUS if FFmpeg is 4.1, but it passes for 4.2+
+        # On 4.1 OPUS produces 48312 samples (extra 312)
-        # 4.1 produces 48312 samples (extra 312)
+        # this has been fixed on 4.2+
-        # Probably this commit fixes it.
+        # TODO: issue warning if on 4.1?
-        # https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
+        if ext == "opus" and lt42():
-        # TODO: issue warning if 4.1?
-        if ext == "opus":
-            ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
-            # 5.1 libavcodec     59. 18.100
-            # 4.4 libavcodec     58.134.100
-            # 4.3 libavcodec     58. 91.100
-            # 4.2 libavcodec     58. 54.100
-            # 4.1 libavcodec     58. 35.100
-            if ver[0] < 59 and ver[1] < 54:
            return
        self.assertEqual(saved.shape, data.shape)
@@ -534,7 +527,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
        # Write data
        dst = self.get_temp_path(filename)
        writer = torchaudio.io.StreamWriter(dst=dst, format=ext)
-        codec_config = torchaudio.io.StreamWriter.CodecConfig(bit_rate=198_000, compression_level=3)
+        codec_config = CodecConfig(bit_rate=198_000, compression_level=3)
        writer.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels, codec_config=codec_config)
        audio = torch.zeros((8000, 2))
@@ -553,7 +546,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
            writer.add_audio_stream(
                sample_rate=sample_rate,
                num_channels=num_channels,
-                codec_config=torchaudio.io.StreamWriter.CodecConfig(bit_rate=bit_rate),
+                codec_config=CodecConfig(bit_rate=bit_rate),
            )
            with writer.open():

--- a/torchaudio/io/__init__.py
+++ b/torchaudio/io/__init__.py
+from ._effector import AudioEffector
 from ._playback import play_audio
 from ._stream_reader import StreamReader
-from ._stream_writer import StreamWriter
+from ._stream_writer import CodecConfig, StreamWriter
 __all__ = [
+    "AudioEffector",
    "StreamReader",
    "StreamWriter",
+    "CodecConfig",
    "play_audio",
 ]
--- a/torchaudio/io/_effector.py
+++ b/torchaudio/io/_effector.py
+import io
+from typing import Iterator, List, Optional
+import torch
+from torch import Tensor
+from ._stream_reader import _get_afilter_desc, StreamReader
+from ._stream_writer import CodecConfig, StreamWriter
+class _StreamingIOBuffer:
+    """Streaming Bytes IO buffer. Data are dropped when read.
+    Written byte strings are queued in arrival order and handed back
+    oldest-first by :py:meth:`pop`.
+    """
+    def __init__(self):
+        # Queue of pending byte strings; index 0 is the oldest.
+        self._buffer: List[bytes] = []
+    def write(self, b: bytes) -> int:
+        """Queue ``b`` (empty input is not stored) and return its length."""
+        if b:
+            self._buffer.append(b)
+        return len(b)
+    def pop(self, n: int) -> bytes:
+        """Pop the oldest byte string. It does not necessarily return the requested amount.
+        At most ``n`` bytes are returned, all taken from the oldest queued
+        byte string only; ``b""`` is returned when nothing is queued.
+        """
+        if not self._buffer:
+            return b""
+        if len(self._buffer[0]) <= n:
+            return self._buffer.pop(0)
+        # The oldest entry is longer than requested: hand out its first ``n``
+        # bytes and keep the remainder at the head of the queue.
+        ret = self._buffer[0][:n]
+        self._buffer[0] = self._buffer[0][n:]
+        return ret
+def _get_sample_fmt(dtype: torch.dtype):
+    """Map a tensor dtype to the corresponding sample format name.
+    Raises:
+        ValueError: If ``dtype`` is not one of the supported dtypes.
+    """
+    types = {
+        torch.uint8: "u8",
+        torch.int16: "s16",
+        torch.int32: "s32",
+        torch.float32: "flt",
+        torch.float64: "dbl",
+    }
+    # Happy path first; anything not in the table is rejected below.
+    if dtype in types:
+        return types[dtype]
+    raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}")
+class _AudioStreamingEncoder:
+    """Given a waveform, encode on-demand and return bytes
+    The waveform is fed to a ``StreamWriter`` ``frames_per_chunk`` frames at a
+    time, only when :py:meth:`read` finds no encoded bytes pending.
+    """
+    def __init__(
+        self,
+        src: Tensor,
+        sample_rate: int,
+        effect: str,
+        muxer: str,
+        encoder: Optional[str],
+        codec_config: Optional[CodecConfig],
+        frames_per_chunk: int,
+    ):
+        self.src = src
+        # The writer emits encoded bytes into this buffer; `read` drains it.
+        self.buffer = _StreamingIOBuffer()
+        self.writer = StreamWriter(self.buffer, format=muxer)
+        self.writer.add_audio_stream(
+            num_channels=src.size(1),
+            sample_rate=sample_rate,
+            format=_get_sample_fmt(src.dtype),
+            encoder=encoder,
+            filter_desc=effect,
+            codec_config=codec_config,
+        )
+        # Opened here and closed in `read` once the whole tensor has been written.
+        self.writer.open()
+        self.fpc = frames_per_chunk
+        # index on the input tensor (along time-axis)
+        # we use -1 to indicate that we finished iterating the tensor and
+        # the writer is closed.
+        self.i_iter = 0
+    def read(self, n):
+        """Return up to ``n`` encoded bytes, encoding more input first if none are pending.
+        Returns ``b""`` once the input is exhausted and the buffer is drained.
+        """
+        # Keep feeding chunks until the writer has produced some bytes or
+        # there is no more input to write.
+        while not self.buffer._buffer and self.i_iter >= 0:
+            self.writer.write_audio_chunk(0, self.src[self.i_iter : self.i_iter + self.fpc])
+            self.i_iter += self.fpc
+            if self.i_iter >= self.src.size(0):
+                # Last chunk written: flush and close the writer, then mark completion.
+                self.writer.flush()
+                self.writer.close()
+                self.i_iter = -1
+        return self.buffer.pop(n)
+def _encode(
+    src: Tensor,
+    sample_rate: int,
+    effect: str,
+    muxer: str,
+    encoder: Optional[str],
+    codec_config: Optional[CodecConfig],
+):
+    """Encode the whole waveform in one go.
+    Returns:
+        io.BytesIO: In-memory buffer holding the encoded data, rewound to the start.
+    """
+    encoded = io.BytesIO()
+    stream_writer = StreamWriter(encoded, format=muxer)
+    # Collect the stream settings first, then register the single audio stream.
+    stream_config = dict(
+        num_channels=src.size(1),
+        sample_rate=sample_rate,
+        format=_get_sample_fmt(src.dtype),
+        encoder=encoder,
+        filter_desc=effect,
+        codec_config=codec_config,
+    )
+    stream_writer.add_audio_stream(**stream_config)
+    # The writer is closed when the ``with`` block exits.
+    with stream_writer.open():
+        stream_writer.write_audio_chunk(0, src)
+    encoded.seek(0)
+    return encoded
+def _get_muxer(dtype: torch.dtype):
+    """Map a tensor dtype to the name of the raw PCM muxer used when no format is given.
+    Raises:
+        ValueError: If ``dtype`` is not one of the supported dtypes.
+    """
+    # TODO: check if this works in Windows.
+    types = {
+        torch.uint8: "u8",
+        torch.int16: "s16le",
+        torch.int32: "s32le",
+        torch.float32: "f32le",
+        torch.float64: "f64le",
+    }
+    # Happy path first; anything not in the table is rejected below.
+    if dtype in types:
+        return types[dtype]
+    raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}")
+class AudioEffector:
+    """Apply various filters and/or codecs to waveforms.
+    .. versionadded:: 2.1
+    Args:
+        effect (str or None, optional): Filter expressions or ``None`` to apply no filter.
+            See https://ffmpeg.org/ffmpeg-filters.html#Audio-Filters for the
+            details of filter syntax.
+        format (str or None, optional): When provided, encode the audio into the
+            corresponding format. Default: ``None``.
+        encoder (str or None, optional): When provided, override the encoder used
+            by the ``format``. Default: ``None``.
+        codec_config (CodecConfig or None, optional): When provided, configure the encoding codec.
+            Should be provided in conjunction with ``format`` option.
+        pad_end (bool, optional): When enabled, and if the waveform becomes shorter after applying
+            effects/codec, then pad the end with silence. Default: ``True``.
+    Raises:
+        ValueError: If ``encoder`` or ``codec_config`` is provided without ``format``.
+    Examples:
+        * Basic usage
+        To use ``AudioEffector``, first instantiate it with a set of
+        ``effect`` and ``format``.
+        >>> # instantiate the effector
+        >>> effector = AudioEffector(effect=..., format=...)
+        Then, use :py:meth:`~AudioEffector.apply` or :py:meth:`~AudioEffector.stream`
+        method to apply them.
+        >>> # Apply the effect to the whole waveform
+        >>> applied = effector.apply(waveform, sample_rate)
+        >>> # Apply the effect chunk-by-chunk
+        >>> for chunk in effector.stream(waveform, sample_rate, frames_per_chunk):
+        >>>    ...
+        * Applying effects
+        Please refer to
+        https://ffmpeg.org/ffmpeg-filters.html#Filtergraph-description
+        for the overview of filter description, and
+        https://ffmpeg.org/ffmpeg-filters.html#toc-Audio-Filters
+        for the list of available filters.
+        Tempo - https://ffmpeg.org/ffmpeg-filters.html#atempo
+        >>> AudioEffector(effect="atempo=1.5")
+        Echo - https://ffmpeg.org/ffmpeg-filters.html#aecho
+        >>> AudioEffector(effect="aecho=0.8:0.88:60:0.4")
+        Flanger - https://ffmpeg.org/ffmpeg-filters.html#flanger
+        >>> AudioEffector(effect="flanger")
+        Vibrato - https://ffmpeg.org/ffmpeg-filters.html#vibrato
+        >>> AudioEffector(effect="vibrato")
+        Tremolo - https://ffmpeg.org/ffmpeg-filters.html#tremolo
+        >>> AudioEffector(effect="tremolo")
+        You can also apply multiple effects at once by chaining them with commas.
+        >>> AudioEffector(effect="atempo=1.5,aecho=0.8:0.88:60:0.4")
+        * Applying codec
+        One can apply codec using ``format`` argument. ``format`` can be
+        audio format or container format. If the container format supports
+        multiple encoders, you can specify it with ``encoder`` argument.
+        Wav format
+        (no compression is applied but samples are converted to
+        16-bit signed integer)
+        >>> AudioEffector(format="wav")
+        Ogg format with default encoder
+        >>> AudioEffector(format="ogg")
+        Ogg format with vorbis
+        >>> AudioEffector(format="ogg", encoder="vorbis")
+        Ogg format with opus
+        >>> AudioEffector(format="ogg", encoder="opus")
+        Webm format with opus
+        >>> AudioEffector(format="webm", encoder="opus")
+        * Applying codec with configuration
+        Reference: https://trac.ffmpeg.org/wiki/Encode/MP3
+        MP3 with default config
+        >>> AudioEffector(format="mp3")
+        MP3 with variable bitrate
+        >>> AudioEffector(format="mp3", codec_config=CodecConfig(qscale=5))
+        MP3 with constant bitrate
+        >>> AudioEffector(format="mp3", codec_config=CodecConfig(bit_rate=32_000))
+    """
+    def __init__(
+        self,
+        effect: Optional[str] = None,
+        format: Optional[str] = None,
+        *,
+        encoder: Optional[str] = None,
+        codec_config: Optional[CodecConfig] = None,
+        pad_end: bool = True,
+    ):
+        # `encoder` and `codec_config` only make sense when a format is being encoded.
+        if format is None:
+            if encoder is not None or codec_config is not None:
+                raise ValueError("`encoder` and/or `codec_config` options are provided without `format` option.")
+        self.effect = effect
+        self.format = format
+        self.encoder = encoder
+        self.codec_config = codec_config
+        self.pad_end = pad_end
+    def _get_reader(self, waveform, sample_rate, frames_per_chunk=None):
+        """Encode ``waveform`` and return a ``StreamReader`` that decodes it back.
+        When ``frames_per_chunk`` is ``None`` the waveform is encoded in one go,
+        otherwise it is encoded lazily while the reader consumes it.
+        """
+        num_frames, num_channels = waveform.shape
+        if self.format is not None:
+            muxer = self.format
+            encoder = self.encoder
+            option = {}
+        else:  # PCM
+            # Raw PCM carries no header, so the reader is told the sample rate
+            # and the number of channels explicitly.
+            muxer = _get_muxer(waveform.dtype)
+            encoder = None
+            option = {"sample_rate": f"{sample_rate}", "channels": f"{num_channels}"}
+        if frames_per_chunk is None:
+            src = _encode(waveform, sample_rate, self.effect, muxer, encoder, self.codec_config)
+        else:
+            src = _AudioStreamingEncoder(
+                waveform, sample_rate, self.effect, muxer, encoder, self.codec_config, frames_per_chunk
+            )
+        # Decode back to the sample rate, sample format and channel count of the input.
+        filter_desc = _get_afilter_desc(sample_rate, _get_sample_fmt(waveform.dtype), num_channels)
+        if self.pad_end:
+            filter_desc = f"{filter_desc},apad=whole_len={num_frames}"
+        reader = StreamReader(src, format=muxer, option=option)
+        reader.add_audio_stream(frames_per_chunk or -1, -1, filter_desc=filter_desc)
+        return reader
+    def apply(self, waveform: Tensor, sample_rate: int) -> Tensor:
+        """Apply the effect and/or codecs to the whole tensor.
+        Args:
+            waveform (Tensor): The input waveform. Shape: ``(time, channel)``
+            sample_rate (int): Sample rate of the waveform.
+        Returns:
+            Tensor:
+                Resulting Tensor. Shape: ``(time, channel)``. The number of frames
+                could be different from that of the input.
+        Raises:
+            ValueError: If ``waveform`` is not 2D.
+        """
+        if waveform.ndim != 2:
+            raise ValueError(f"Expected the input waveform to be 2D. Found: {waveform.ndim}")
+        # Nothing to process; hand the empty input back unchanged.
+        if waveform.numel() == 0:
+            return waveform
+        reader = self._get_reader(waveform, sample_rate)
+        reader.process_all_packets()
+        (applied,) = reader.pop_chunks()
+        return Tensor(applied)
+    def stream(self, waveform: Tensor, sample_rate: int, frames_per_chunk: int) -> Iterator[Tensor]:
+        """Apply the effect and/or codecs to the given tensor chunk by chunk.
+        This method is a generator: the input is validated and processing
+        starts when the returned iterator is first advanced.
+        Args:
+            waveform (Tensor): The input waveform. Shape: ``(time, channel)``
+            sample_rate (int): Sample rate of the waveform.
+            frames_per_chunk (int): The number of frames to return at a time.
+        Returns:
+            Iterator[Tensor]:
+                Series of processed chunks. Shape: ``(time, channel)``, where
+                the number of frames matches ``frames_per_chunk`` except the
+                last chunk, which could be shorter.
+        Raises:
+            ValueError: If ``waveform`` is not 2D.
+        """
+        if waveform.ndim != 2:
+            raise ValueError(f"Expected the input waveform to be 2D. Found: {waveform.ndim}")
+        # Empty input yields no chunks. A generator cannot hand a value back to
+        # a ``for`` loop through ``return``, so end the iteration with a bare one.
+        if waveform.numel() == 0:
+            return
+        reader = self._get_reader(waveform, sample_rate, frames_per_chunk)
+        for (applied,) in reader.stream():
+            yield Tensor(applied)
--- a/torchaudio/io/_stream_writer.py
+++ b/torchaudio/io/_stream_writer.py
@@ -11,6 +11,33 @@ else:
    ConfigBase = object
+@dataclass
+class CodecConfig(ConfigBase):
+    """Codec configuration."""
+    # NOTE(review): the ``-1`` / ``None`` defaults presumably mean "leave the
+    # encoder default in place" - confirm against the ConfigBase implementation.
+    bit_rate: int = -1
+    """Bit rate"""
+    compression_level: int = -1
+    """Compression level"""
+    qscale: Optional[int] = None
+    """Global quality factor. Enables variable bit rate. Valid values depend on encoder.
+    For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while
+    libvorbis takes ``-1`` - ``10``.
+    """
+    gop_size: int = -1
+    """The number of pictures in a group of pictures, or 0 for intra_only"""
+    max_b_frames: int = -1
+    """maximum number of B-frames between non-B-frames."""
+    def __post_init__(self):
+        # Runs after the dataclass-generated ``__init__``: forward the field
+        # values positionally, in declaration order, to the base initializer.
+        super().__init__(self.bit_rate, self.compression_level, self.qscale, self.gop_size, self.max_b_frames)
 def _format_doc(**kwargs):
    def decorator(obj):
        obj.__doc__ = obj.__doc__.format(**kwargs)
@@ -131,32 +158,6 @@ class StreamWriter:
            Default: `4096`.
    """
-    @dataclass
-    class CodecConfig(ConfigBase):
-        """Codec configuration."""
-        bit_rate: int = -1
-        """Bit rate"""
-        compression_level: int = -1
-        """Compression level"""
-        qscale: Optional[int] = None
-        """Global quality factor. Enables variable bit rate. Valid values depend on encoder.
-        For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while
-        libvorbis takes ``-1`` - ``10``.
-        """
-        gop_size: int = -1
-        """The number of pictures in a group of pictures, or 0 for intra_only"""
-        max_b_frames: int = -1
-        """maximum number of B-frames between non-B-frames."""
-        def __post_init__(self):
-            super().__init__(self.bit_rate, self.compression_level, self.qscale, self.gop_size, self.max_b_frames)
    def __init__(
        self,
        dst: Union[str, BinaryIO],