Commit a4036248 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Add AudioEffector (#3163)

Summary:
This commit adds a new feature AudioEffector, which can be used to
apply various effects and codecs to waveforms in Tensor.

Under the hood it uses StreamWriter and StreamReader to apply
filters and encode/decode.

This is going to replace the deprecated `apply_codec` and
`apply_sox_effect_tensor` functions.

It can also perform online, chunk-by-chunk filtering.

Tutorial to follow.

closes https://github.com/pytorch/audio/issues/3161

Pull Request resolved: https://github.com/pytorch/audio/pull/3163

Reviewed By: hwangjeff

Differential Revision: D44576660

Pulled By: mthrok

fbshipit-source-id: 2c5cc87082ab431315d29d56d6ac9efaf4cf7aeb
parent fda41bbf
...@@ -42,7 +42,6 @@ Methods ...@@ -42,7 +42,6 @@ Methods
not item.startswith('_') not item.startswith('_')
and item not in inherited_members and item not in inherited_members
and item not in attributes and item not in attributes
and item != "CodecConfig"
%} %}
{{ item | underline("~") }} {{ item | underline("~") }}
...@@ -56,11 +55,12 @@ Methods ...@@ -56,11 +55,12 @@ Methods
{%- endif %} {%- endif %}
{%- if name == "StreamReader" %} {%- if name in ["StreamReader", "StreamWriter"] %}
Support Structures Support Structures
------------------ ------------------
{%- if name == "StreamReader" %}
{%- for item in [ {%- for item in [
"ChunkTensor", "ChunkTensor",
"SourceStream", "SourceStream",
...@@ -77,15 +77,14 @@ Support Structures ...@@ -77,15 +77,14 @@ Support Structures
:members: :members:
{%- endfor %} {%- endfor %}
{%- elif name == "StreamWriter" %}
Support Structures {%- elif name == "StreamWriter" %}
------------------
CodecConfig CodecConfig
~~~~~~~~~~~ ~~~~~~~~~~~
.. autoclass:: torchaudio.io::StreamWriter.CodecConfig() .. autoclass:: torchaudio.io::CodecConfig
:members: :members:
{%- endif %} {%- endif %}
{%- endif %}
...@@ -12,6 +12,7 @@ torchaudio.io ...@@ -12,6 +12,7 @@ torchaudio.io
StreamReader StreamReader
StreamWriter StreamWriter
AudioEffector
play_audio play_audio
.. rubric:: Tutorials using ``torchaudio.io`` .. rubric:: Tutorials using ``torchaudio.io``
......
import torchaudio
# If FFmpeg is 4.1 or older,
# tests that check the number of output samples from OPUS fail.
# They work on 4.2+.
# Probably this commit fixed it.
# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
def lt42(ver=None):
    """Tell whether libavcodec is older than the one bundled with FFmpeg 4.2.

    Args:
        ver (sequence of int or None, optional): libavcodec version given as
            ``(major, minor, ...)``. When ``None`` (default), the version is
            looked up with ``torchaudio.utils.ffmpeg_utils.get_versions()``.

    Returns:
        bool: ``True`` if ``(major, minor)`` is lower than ``(58, 54)``.
    """
    if ver is None:
        ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
    # 5.1 libavcodec 59. 18.100
    # 4.4 libavcodec 58.134.100
    # 4.3 libavcodec 58. 91.100
    # 4.2 libavcodec 58. 54.100
    # 4.1 libavcodec 58. 35.100
    # Compare (major, minor) lexicographically. Testing each component on its
    # own misreports versions with an older major but a large minor (e.g. 57.107).
    return tuple(ver[:2]) < (58, 54)
from parameterized import parameterized
from torchaudio.io import AudioEffector
from torchaudio_unittest.common_utils import get_sinusoid, skipIfNoFFmpeg, TorchaudioTestCase
from .common import lt42
@skipIfNoFFmpeg
class EffectorTest(TorchaudioTestCase):
    def test_null(self):
        """Without an effect or a codec, the waveform comes back unchanged"""
        rate = 8000
        chunk_size = 256

        effector = AudioEffector(effect=None, format=None)
        waveform = get_sinusoid(n_channels=3, sample_rate=rate, channels_first=False)

        # whole waveform in a single call
        result = effector.apply(waveform, rate)
        self.assertEqual(waveform, result)

        # chunk by chunk: each chunk must equal the matching slice of the input
        offset = 0
        for chunk in effector.stream(waveform, rate, chunk_size):
            self.assertEqual(waveform[offset : offset + chunk_size, :], chunk)
            offset += chunk_size

    @parameterized.expand(
        [
            ("ogg", "flac"),  # flac only supports s16 and s32
            ("ogg", "opus"),  # opus only supports 48k Hz
            ("ogg", "vorbis"),  # vorbis only supports stereo
            ("wav", None),
            ("wav", "pcm_u8"),
            ("mp3", None),
        ]
    )
    def test_formats(self, format, encoder):
        """Formats (some with restrictions) just work without an issue in effector"""
        rate = 8000

        effector = AudioEffector(format=format, encoder=encoder)
        waveform = get_sinusoid(n_channels=3, sample_rate=rate, channels_first=False)
        result = effector.apply(waveform, rate)

        # OPUS on FFmpeg 4.1 yields 8020 samples (20 extra);
        # 4.2+ does not have this problem, so the shape checks are skipped on 4.1.
        if encoder == "opus" and lt42():
            return

        self.assertEqual(waveform.shape, result.shape)

        # Note
        # MP3 adds padding, and the padding cannot be removed when the encoded
        # data is written to a file-like object that has no seek method.
        # The amount of padding is available as `AVCodecContext::initial_padding`
        # https://ffmpeg.org/doxygen/4.1/structAVCodecContext.html#a8f95550ce04f236e9915516d04d3d1ab
        # but that value is not exposed yet.
        # These "priming" samples carry negative time stamps, so logic to drop
        # them at decoding time could be added. However, as far as I checked,
        # the time stamp is reset when the data is loaded with StreamReader.
        # I tried options like avoid_negative_ts,
        # https://ffmpeg.org/ffmpeg-formats.html
        # but it made no difference. Perhaps the negative-timestamp information
        # is only available on the encoding side and is presumably written to
        # the header, which somehow does not happen with a file-like object.
        # More investigation is needed to remove the MP3 padding.
        if format != "mp3":
            total_frames = waveform.size(0)
            for chunk in effector.stream(waveform, rate, frames_per_chunk=total_frames):
                self.assertEqual(waveform.shape, chunk.shape)

    @parameterized.expand([("loudnorm=I=-16:LRA=11:TP=-1.5",), ("volume=2",)])
    def test_effect(self, effect):
        """Applying an effect keeps the shape of the waveform"""
        rate = 8000

        effector = AudioEffector(effect=effect)
        waveform = get_sinusoid(n_channels=3, sample_rate=rate, channels_first=False)
        result = effector.apply(waveform, rate)
        self.assertEqual(waveform.shape, result.shape)
...@@ -17,8 +17,10 @@ from torchaudio_unittest.common_utils import ( ...@@ -17,8 +17,10 @@ from torchaudio_unittest.common_utils import (
TorchaudioTestCase, TorchaudioTestCase,
) )
from .common import lt42
if is_ffmpeg_available(): if is_ffmpeg_available():
from torchaudio.io import StreamReader, StreamWriter from torchaudio.io import CodecConfig, StreamReader, StreamWriter
def get_audio_chunk(fmt, sample_rate, num_channels): def get_audio_chunk(fmt, sample_rate, num_channels):
...@@ -380,19 +382,10 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -380,19 +382,10 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
s.process_all_packets() s.process_all_packets()
(saved,) = s.pop_chunks() (saved,) = s.pop_chunks()
# This test fails for OPUS if FFmpeg is 4.1, but it passes for 4.2+ # On 4.1 OPUS produces 48312 samples (extra 312)
# 4.1 produces 48312 samples (extra 312) # this has been fixed on 4.2+
# Probably this commit fixes it. # TODO: issue warning if on 4.1?
# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c if ext == "opus" and lt42():
# TODO: issue warning if 4.1?
if ext == "opus":
ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
# 5.1 libavcodec 59. 18.100
# 4.4 libavcodec 58.134.100
# 4.3 libavcodec 58. 91.100
# 4.2 libavcodec 58. 54.100
# 4.1 libavcodec 58. 35.100
if ver[0] < 59 and ver[1] < 54:
return return
self.assertEqual(saved.shape, data.shape) self.assertEqual(saved.shape, data.shape)
...@@ -534,7 +527,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -534,7 +527,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
# Write data # Write data
dst = self.get_temp_path(filename) dst = self.get_temp_path(filename)
writer = torchaudio.io.StreamWriter(dst=dst, format=ext) writer = torchaudio.io.StreamWriter(dst=dst, format=ext)
codec_config = torchaudio.io.StreamWriter.CodecConfig(bit_rate=198_000, compression_level=3) codec_config = CodecConfig(bit_rate=198_000, compression_level=3)
writer.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels, codec_config=codec_config) writer.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels, codec_config=codec_config)
audio = torch.zeros((8000, 2)) audio = torch.zeros((8000, 2))
...@@ -553,7 +546,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase): ...@@ -553,7 +546,7 @@ class StreamWriterCorrectnessTest(TempDirMixin, TorchaudioTestCase):
writer.add_audio_stream( writer.add_audio_stream(
sample_rate=sample_rate, sample_rate=sample_rate,
num_channels=num_channels, num_channels=num_channels,
codec_config=torchaudio.io.StreamWriter.CodecConfig(bit_rate=bit_rate), codec_config=CodecConfig(bit_rate=bit_rate),
) )
with writer.open(): with writer.open():
......
from ._effector import AudioEffector
from ._playback import play_audio from ._playback import play_audio
from ._stream_reader import StreamReader from ._stream_reader import StreamReader
from ._stream_writer import StreamWriter from ._stream_writer import CodecConfig, StreamWriter
__all__ = [ __all__ = [
"AudioEffector",
"StreamReader", "StreamReader",
"StreamWriter", "StreamWriter",
"CodecConfig",
"play_audio", "play_audio",
] ]
import io
from typing import Iterator, List, Optional
import torch
from torch import Tensor
from ._stream_reader import _get_afilter_desc, StreamReader
from ._stream_writer import CodecConfig, StreamWriter
class _StreamingIOBuffer:
"""Streaming Bytes IO buffer. Data are dropped when read."""
def __init__(self):
self._buffer: List(bytes) = []
def write(self, b: bytes):
if b:
self._buffer.append(b)
return len(b)
def pop(self, n):
"""Pop the oldest byte string. It does not necessary return the requested amount"""
if not self._buffer:
return b""
if len(self._buffer[0]) <= n:
return self._buffer.pop(0)
ret = self._buffer[0][:n]
self._buffer[0] = self._buffer[0][n:]
return ret
def _get_sample_fmt(dtype: torch.dtype):
types = {
torch.uint8: "u8",
torch.int16: "s16",
torch.int32: "s32",
torch.float32: "flt",
torch.float64: "dbl",
}
if dtype not in types:
raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}")
return types[dtype]
class _AudioStreamingEncoder:
    """Given a waveform, encode on-demand and return bytes

    The waveform is fed to a ``StreamWriter`` chunk by chunk, only when
    :py:meth:`read` finds no encoded data left in the internal buffer.

    Args:
        src (Tensor): The waveform to encode. ``src.size(1)`` is used as the
            number of channels and axis 0 is iterated as time.
        sample_rate (int): Sample rate of the waveform.
        effect (str or None): Passed to the writer as ``filter_desc``.
        muxer (str): Passed to the writer as ``format``.
        encoder (str or None): Passed to the writer as ``encoder``.
        codec_config (CodecConfig or None): Passed to the writer as ``codec_config``.
        frames_per_chunk (int): The number of frames of ``src`` written to the
            writer at a time.
    """

    def __init__(
        self,
        src: Tensor,
        sample_rate: int,
        effect: Optional[str],
        muxer: str,
        encoder: Optional[str],
        codec_config: Optional[CodecConfig],
        frames_per_chunk: int,
    ):
        self.src = src
        # The writer pushes encoded bytes here; ``read`` pops them back out.
        self.buffer = _StreamingIOBuffer()
        self.writer = StreamWriter(self.buffer, format=muxer)
        self.writer.add_audio_stream(
            num_channels=src.size(1),
            sample_rate=sample_rate,
            format=_get_sample_fmt(src.dtype),
            encoder=encoder,
            filter_desc=effect,
            codec_config=codec_config,
        )
        # The writer stays open until ``read`` has consumed the whole waveform.
        self.writer.open()
        self.fpc = frames_per_chunk

        # index on the input tensor (along time-axis)
        # we use -1 to indicate that we finished iterating the tensor and
        # the writer is closed.
        self.i_iter = 0

    def read(self, n):
        """Return up to ``n`` bytes of encoded data.

        While the buffer is empty and input remains, the next chunk of the
        waveform is encoded. Once the last chunk is written, the writer is
        flushed and closed.

        Args:
            n (int): The maximum number of bytes to return.

        Returns:
            bytes: Encoded data, or ``b""`` once everything has been consumed.
        """
        while not self.buffer._buffer and self.i_iter >= 0:
            self.writer.write_audio_chunk(0, self.src[self.i_iter : self.i_iter + self.fpc])
            self.i_iter += self.fpc
            if self.i_iter >= self.src.size(0):
                self.writer.flush()
                self.writer.close()
                self.i_iter = -1
        return self.buffer.pop(n)
def _encode(
    src: Tensor,
    sample_rate: int,
    effect: Optional[str],
    muxer: str,
    encoder: Optional[str],
    codec_config: Optional[CodecConfig],
) -> io.BytesIO:
    """Encode the whole waveform in one go into an in-memory buffer.

    Args:
        src (Tensor): The waveform to encode. ``src.size(1)`` is used as the
            number of channels.
        sample_rate (int): Sample rate of the waveform.
        effect (str or None): Passed to the writer as ``filter_desc``.
        muxer (str): Passed to the writer as ``format``.
        encoder (str or None): Passed to the writer as ``encoder``.
        codec_config (CodecConfig or None): Passed to the writer as ``codec_config``.

    Returns:
        io.BytesIO: Buffer holding the encoded data, rewound to the beginning.
    """
    buffer = io.BytesIO()
    writer = StreamWriter(buffer, format=muxer)
    writer.add_audio_stream(
        num_channels=src.size(1),
        sample_rate=sample_rate,
        format=_get_sample_fmt(src.dtype),
        encoder=encoder,
        filter_desc=effect,
        codec_config=codec_config,
    )
    with writer.open():
        writer.write_audio_chunk(0, src)
    # Rewind so that the consumer reads from the first byte.
    buffer.seek(0)
    return buffer
def _get_muxer(dtype: torch.dtype):
# TODO: check if this works in Windows.
types = {
torch.uint8: "u8",
torch.int16: "s16le",
torch.int32: "s32le",
torch.float32: "f32le",
torch.float64: "f64le",
}
if dtype not in types:
raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}")
return types[dtype]
class AudioEffector:
    """Apply various filters and/or codecs to waveforms.

    .. versionadded:: 2.1

    Args:
        effect (str or None, optional): Filter expressions or ``None`` to apply no filter.
            See https://ffmpeg.org/ffmpeg-filters.html#Audio-Filters for the
            details of filter syntax.

        format (str or None, optional): When provided, encode the audio into the
            corresponding format. Default: ``None``.

        encoder (str or None, optional): When provided, override the encoder used
            by the ``format``. Default: ``None``.

        codec_config (CodecConfig or None, optional): When provided, configure the encoding codec.
            Should be provided in conjunction with ``format`` option.

        pad_end (bool, optional): When enabled, and if the waveform becomes shorter after applying
            effects/codec, then pad the end with silence. Default: ``True``.

    Examples:
        * Basic usage

          To use ``AudioEffector``, first instantiate it with a set of
          ``effect`` and ``format``.

          >>> # instantiate the effector
          >>> effector = AudioEffector(effect=..., format=...)

          Then, use :py:meth:`~AudioEffector.apply` or :py:meth:`~AudioEffector.stream`
          method to apply them.

          >>> # Apply the effect to the whole waveform
          >>> applied = effector.apply(waveform, sample_rate)

          >>> # Apply the effect chunk-by-chunk
          >>> for chunk in effector.stream(waveform, sample_rate, frames_per_chunk=4096):
          ...     ...

        * Applying effects

          Please refer to
          https://ffmpeg.org/ffmpeg-filters.html#Filtergraph-description
          for the overview of filter description, and
          https://ffmpeg.org/ffmpeg-filters.html#toc-Audio-Filters
          for the list of available filters.

          Tempo - https://ffmpeg.org/ffmpeg-filters.html#atempo

          >>> AudioEffector(effect="atempo=1.5")

          Echo - https://ffmpeg.org/ffmpeg-filters.html#aecho

          >>> AudioEffector(effect="aecho=0.8:0.88:60:0.4")

          Flanger - https://ffmpeg.org/ffmpeg-filters.html#flanger

          >>> AudioEffector(effect="flanger")

          Vibrato - https://ffmpeg.org/ffmpeg-filters.html#vibrato

          >>> AudioEffector(effect="vibrato")

          Tremolo - https://ffmpeg.org/ffmpeg-filters.html#tremolo

          >>> AudioEffector(effect="tremolo")

          You can also apply multiple effects at once.

          >>> AudioEffector(effect="atempo=1.5,aecho=0.8:0.88:60:0.4")

        * Applying codec

          One can apply codec using ``format`` argument. ``format`` can be
          audio format or container format. If the container format supports
          multiple encoders, you can specify it with ``encoder`` argument.

          Wav format
          (no compression is applied but samples are converted to
          16-bit signed integer)

          >>> AudioEffector(format="wav")

          Ogg format with default encoder

          >>> AudioEffector(format="ogg")

          Ogg format with vorbis

          >>> AudioEffector(format="ogg", encoder="vorbis")

          Ogg format with opus

          >>> AudioEffector(format="ogg", encoder="opus")

          Webm format with opus

          >>> AudioEffector(format="webm", encoder="opus")

        * Applying codec with configuration

          Reference: https://trac.ffmpeg.org/wiki/Encode/MP3

          MP3 with default config

          >>> AudioEffector(format="mp3")

          MP3 with variable bitrate

          >>> AudioEffector(format="mp3", codec_config=CodecConfig(qscale=5))

          MP3 with constant bitrate

          >>> AudioEffector(format="mp3", codec_config=CodecConfig(bit_rate=32_000))
    """

    def __init__(
        self,
        effect: Optional[str] = None,
        format: Optional[str] = None,
        *,
        encoder: Optional[str] = None,
        codec_config: Optional[CodecConfig] = None,
        pad_end: bool = True,
    ):
        # ``encoder`` and ``codec_config`` are only used together with ``format``.
        if format is None:
            if encoder is not None or codec_config is not None:
                raise ValueError("`encoder` and/or `codec_config` options are provided without `format` option.")
        self.effect = effect
        self.format = format
        self.encoder = encoder
        self.codec_config = codec_config
        self.pad_end = pad_end

    def _get_reader(self, waveform, sample_rate, frames_per_chunk=None):
        """Encode ``waveform`` and return a ``StreamReader`` that decodes it back.

        Args:
            waveform (Tensor): The input waveform. Shape: ``(time, channel)``
            sample_rate (int): Sample rate of the waveform.
            frames_per_chunk (int or None, optional): When ``None``, the whole
                waveform is encoded up front. Otherwise it is encoded lazily,
                ``frames_per_chunk`` frames at a time, as the reader pulls data.

        Returns:
            StreamReader: Reader with one audio output stream configured.
        """
        num_frames, num_channels = waveform.shape

        if self.format is not None:
            muxer = self.format
            encoder = self.encoder
            option = {}
        else:  # PCM
            muxer = _get_muxer(waveform.dtype)
            encoder = None
            # The reader is given the sample rate and channel count explicitly here.
            option = {"sample_rate": f"{sample_rate}", "channels": f"{num_channels}"}

        if frames_per_chunk is None:
            src = _encode(waveform, sample_rate, self.effect, muxer, encoder, self.codec_config)
        else:
            src = _AudioStreamingEncoder(
                waveform, sample_rate, self.effect, muxer, encoder, self.codec_config, frames_per_chunk
            )

        # Decode back to the sample rate, sample format and channel count of the input.
        filter_desc = _get_afilter_desc(sample_rate, _get_sample_fmt(waveform.dtype), num_channels)
        if self.pad_end:
            filter_desc = f"{filter_desc},apad=whole_len={num_frames}"

        reader = StreamReader(src, format=muxer, option=option)
        reader.add_audio_stream(frames_per_chunk or -1, -1, filter_desc=filter_desc)
        return reader

    def apply(self, waveform: Tensor, sample_rate: int) -> Tensor:
        """Apply the effect and/or codecs to the whole tensor.

        Args:
            waveform (Tensor): The input waveform. Shape: ``(time, channel)``
            sample_rate (int): Sample rate of the waveform.

        Returns:
            Tensor:
                Resulting Tensor. Shape: ``(time, channel)``. The number of frames
                could be different from that of the input.

        Raises:
            ValueError: If ``waveform`` is not 2D.
        """
        if waveform.ndim != 2:
            raise ValueError(f"Expected the input waveform to be 2D. Found: {waveform.ndim}")

        # Nothing to process; the input is returned as-is.
        if waveform.numel() == 0:
            return waveform

        reader = self._get_reader(waveform, sample_rate)
        reader.process_all_packets()
        (applied,) = reader.pop_chunks()
        return Tensor(applied)

    def stream(self, waveform: Tensor, sample_rate: int, frames_per_chunk: int) -> Iterator[Tensor]:
        """Apply the effect and/or codecs to the given tensor chunk by chunk.

        This is a generator; the arguments are validated when the iteration starts.

        Args:
            waveform (Tensor): The input waveform. Shape: ``(time, channel)``
            sample_rate (int): Sample rate of the waveform.
            frames_per_chunk (int): The number of frames to return at a time.
                Must be positive.

        Returns:
            Iterator[Tensor]:
                Series of processed chunks. Shape: ``(time, channel)``, where
                the number of frames matches ``frames_per_chunk`` except the
                last chunk, which could be shorter.

        Raises:
            ValueError: If ``waveform`` is not 2D, or if ``frames_per_chunk``
                is not positive.
        """
        if waveform.ndim != 2:
            raise ValueError(f"Expected the input waveform to be 2D. Found: {waveform.ndim}")

        # Nothing to process; the iteration ends without yielding.
        if waveform.numel() == 0:
            return

        # A non-positive chunk size never advances through the waveform correctly
        # in the streaming encoder, so reject it up front. ``None`` is let through
        # because ``_get_reader`` treats it as "encode everything at once".
        if frames_per_chunk is not None and frames_per_chunk <= 0:
            raise ValueError(f"Expected `frames_per_chunk` to be positive. Found: {frames_per_chunk}")

        reader = self._get_reader(waveform, sample_rate, frames_per_chunk)
        for (applied,) in reader.stream():
            yield Tensor(applied)
...@@ -11,6 +11,33 @@ else: ...@@ -11,6 +11,33 @@ else:
ConfigBase = object ConfigBase = object
@dataclass
class CodecConfig(ConfigBase):
    """Codec configuration.

    After the dataclass-generated ``__init__`` has stored the fields,
    ``__post_init__`` forwards all of them to the ``ConfigBase`` initializer.
    """

    # NOTE(review): the ``-1`` defaults presumably mean "leave it to the encoder" — confirm against ConfigBase.
    bit_rate: int = -1
    """Bit rate"""

    compression_level: int = -1
    """Compression level"""

    qscale: Optional[int] = None
    """Global quality factor. Enables variable bit rate. Valid values depend on encoder.
    For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while
    libvorbis takes ``-1`` - ``10``.
    """

    gop_size: int = -1
    """The number of pictures in a group of pictures, or 0 for intra_only"""

    max_b_frames: int = -1
    """maximum number of B-frames between non-B-frames."""

    def __post_init__(self):
        # Pass the field values positionally, in declaration order, to the base class.
        super().__init__(self.bit_rate, self.compression_level, self.qscale, self.gop_size, self.max_b_frames)
def _format_doc(**kwargs): def _format_doc(**kwargs):
def decorator(obj): def decorator(obj):
obj.__doc__ = obj.__doc__.format(**kwargs) obj.__doc__ = obj.__doc__.format(**kwargs)
...@@ -131,32 +158,6 @@ class StreamWriter: ...@@ -131,32 +158,6 @@ class StreamWriter:
Default: `4096`. Default: `4096`.
""" """
@dataclass
class CodecConfig(ConfigBase):
    """Codec configuration.

    After the dataclass-generated ``__init__`` has stored the fields,
    ``__post_init__`` forwards all of them to the ``ConfigBase`` initializer.
    """

    # NOTE(review): the ``-1`` defaults presumably mean "leave it to the encoder" — confirm against ConfigBase.
    bit_rate: int = -1
    """Bit rate"""

    compression_level: int = -1
    """Compression level"""

    qscale: Optional[int] = None
    """Global quality factor. Enables variable bit rate. Valid values depend on encoder.
    For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while
    libvorbis takes ``-1`` - ``10``.
    """

    gop_size: int = -1
    """The number of pictures in a group of pictures, or 0 for intra_only"""

    max_b_frames: int = -1
    """maximum number of B-frames between non-B-frames."""

    def __post_init__(self):
        # Pass the field values positionally, in declaration order, to the base class.
        super().__init__(self.bit_rate, self.compression_level, self.qscale, self.gop_size, self.max_b_frames)
def __init__( def __init__(
self, self,
dst: Union[str, BinaryIO], dst: Union[str, BinaryIO],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment