Commit ffeba11a authored by mayp777's avatar mayp777
Browse files

UPDATE

parent 29deb085
from typing import List, Optional
import torchaudio
from torchaudio._internal.module_utils import deprecated
# TODO: Once legacy global backend is removed, move this to torchaudio.__init__
def _init_backend():
from . import utils
torchaudio.info = utils.get_info_func()
torchaudio.load = utils.get_load_func()
torchaudio.save = utils.get_save_func()
def list_audio_backends() -> List[str]:
"""List available backends
Returns:
list of str: The list of available backends.
The possible values are;
- Dispatcher mode: ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
- Legacy backend mode: ``"sox_io"``, ``"soundfile"``.
"""
from . import utils
return list(utils.get_available_backends().keys())
# Temporary until global backend is removed
@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
def get_audio_backend() -> Optional[str]:
"""Get the name of the current global backend
Returns:
str or None:
If dispatcher mode is enabled, returns ``None`` otherwise,
the name of current backend or ``None`` (no backend is set).
"""
return None
# Temporary until global backend is removed
@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
def set_audio_backend(backend: Optional[str]): # noqa
"""Set the global backend.
This is a no-op when dispatcher mode is enabled.
Args:
backend (str or None): Name of the backend.
One of ``"sox_io"`` or ``"soundfile"`` based on availability
of the system. If ``None`` is provided the current backend is unassigned.
"""
pass
import os
from abc import ABC, abstractmethod
from typing import BinaryIO, Optional, Tuple, Union
from torch import Tensor
from torchaudio.io import CodecConfig
from .common import AudioMetaData
class Backend(ABC):
@staticmethod
@abstractmethod
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
raise NotImplementedError
@staticmethod
@abstractmethod
def load(
uri: Union[BinaryIO, str, os.PathLike],
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
) -> Tuple[Tensor, int]:
raise NotImplementedError
@staticmethod
@abstractmethod
def save(
uri: Union[BinaryIO, str, os.PathLike],
src: Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
compression: Optional[Union[CodecConfig, float, int]] = None,
) -> None:
raise NotImplementedError
@staticmethod
@abstractmethod
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
raise NotImplementedError
@staticmethod
@abstractmethod
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
raise NotImplementedError
class AudioMetaData:
"""AudioMetaData()
Return type of ``torchaudio.info`` function.
:ivar int sample_rate: Sample rate
:ivar int num_frames: The number of frames
:ivar int num_channels: The number of channels
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
or when it cannot be accurately inferred.
:ivar str encoding: Audio encoding
The values encoding can take are one of the following:
* ``PCM_S``: Signed integer linear PCM
* ``PCM_U``: Unsigned integer linear PCM
* ``PCM_F``: Floating point linear PCM
* ``FLAC``: Flac, Free Lossless Audio Codec
* ``ULAW``: Mu-law
* ``ALAW``: A-law
* ``MP3`` : MP3, MPEG-1 Audio Layer III
* ``VORBIS``: OGG Vorbis
* ``AMR_WB``: Adaptive Multi-Rate Wideband
* ``AMR_NB``: Adaptive Multi-Rate Narrowband
* ``OPUS``: Opus
* ``HTK``: Single channel 16-bit PCM
* ``UNKNOWN`` : None of above
"""
def __init__(
self,
sample_rate: int,
num_frames: int,
num_channels: int,
bits_per_sample: int,
encoding: str,
):
self.sample_rate = sample_rate
self.num_frames = num_frames
self.num_channels = num_channels
self.bits_per_sample = bits_per_sample
self.encoding = encoding
def __str__(self):
return (
f"AudioMetaData("
f"sample_rate={self.sample_rate}, "
f"num_frames={self.num_frames}, "
f"num_channels={self.num_channels}, "
f"bits_per_sample={self.bits_per_sample}, "
f"encoding={self.encoding}"
f")"
)
import os
import re
import sys
from typing import BinaryIO, Optional, Tuple, Union
import torch
import torchaudio
from torchaudio.io import StreamWriter
from .backend import Backend
from .common import AudioMetaData
if torchaudio._extension._FFMPEG_EXT is not None:
StreamReaderFileObj = torchaudio._extension._FFMPEG_EXT.StreamReaderFileObj
else:
StreamReaderFileObj = object
def info_audio(
src: str,
format: Optional[str],
) -> AudioMetaData:
i = torch.ops.torchaudio.compat_info(src, format)
return AudioMetaData(i[0], i[1], i[2], i[3], i[4].upper())
def info_audio_fileobj(
src,
format: Optional[str],
buffer_size: int = 4096,
) -> AudioMetaData:
s = StreamReaderFileObj(src, format, None, buffer_size)
i = s.find_best_audio_stream()
sinfo = s.get_src_stream_info(i)
if sinfo.num_frames == 0:
waveform = _load_audio_fileobj(s)
num_frames = waveform.size(1)
else:
num_frames = sinfo.num_frames
return AudioMetaData(
int(sinfo.sample_rate),
num_frames,
sinfo.num_channels,
sinfo.bits_per_sample,
sinfo.codec_name.upper(),
)
def _get_load_filter(
frame_offset: int = 0,
num_frames: int = -1,
convert: bool = True,
) -> Optional[str]:
if frame_offset < 0:
raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
if num_frames == 0 or num_frames < -1:
raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
# All default values -> no filter
if frame_offset == 0 and num_frames == -1 and not convert:
return None
# Only convert
aformat = "aformat=sample_fmts=fltp"
if frame_offset == 0 and num_frames == -1 and convert:
return aformat
# At least one of frame_offset or num_frames has non-default value
if num_frames > 0:
atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames)
else:
atrim = "atrim=start_sample={}".format(frame_offset)
if not convert:
return atrim
return "{},{}".format(atrim, aformat)
def _load_audio_fileobj(
s: StreamReaderFileObj,
filter: Optional[str] = None,
channels_first: bool = True,
) -> torch.Tensor:
i = s.find_best_audio_stream()
s.add_audio_stream(i, -1, -1, filter, None, None)
s.process_all_packets()
chunk = s.pop_chunks()[0]
if chunk is None:
raise RuntimeError("Failed to decode audio.")
waveform = chunk.frames
return waveform.T if channels_first else waveform
def load_audio(
src: str,
frame_offset: int = 0,
num_frames: int = -1,
convert: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
filter = _get_load_filter(frame_offset, num_frames, convert)
return torch.ops.torchaudio.compat_load(src, format, filter, channels_first)
def load_audio_fileobj(
src: BinaryIO,
frame_offset: int = 0,
num_frames: int = -1,
convert: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
) -> Tuple[torch.Tensor, int]:
demuxer = "ogg" if format == "vorbis" else format
s = StreamReaderFileObj(src, demuxer, None, buffer_size)
sample_rate = int(s.get_src_stream_info(s.find_best_audio_stream()).sample_rate)
filter = _get_load_filter(frame_offset, num_frames, convert)
waveform = _load_audio_fileobj(s, filter, channels_first)
return waveform, sample_rate
def _get_sample_format(dtype: torch.dtype) -> str:
dtype_to_format = {
torch.uint8: "u8",
torch.int16: "s16",
torch.int32: "s32",
torch.int64: "s64",
torch.float32: "flt",
torch.float64: "dbl",
}
format = dtype_to_format.get(dtype)
if format is None:
raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.")
return format
def _native_endianness() -> str:
if sys.byteorder == "little":
return "le"
else:
return "be"
def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
endianness = _native_endianness()
if not encoding:
if not bits_per_sample:
# default to PCM S16
return f"pcm_s16{endianness}"
if bits_per_sample == 8:
return "pcm_u8"
return f"pcm_s{bits_per_sample}{endianness}"
if encoding == "PCM_S":
if not bits_per_sample:
bits_per_sample = 16
if bits_per_sample == 8:
raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
return f"pcm_s{bits_per_sample}{endianness}"
if encoding == "PCM_U":
if bits_per_sample in (None, 8):
return "pcm_u8"
raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.")
if encoding == "PCM_F":
if not bits_per_sample:
bits_per_sample = 32
if bits_per_sample in (32, 64):
return f"pcm_f{bits_per_sample}{endianness}"
raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "pcm_mulaw"
raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.")
if encoding == "ALAW":
if bits_per_sample in (None, 8):
return "pcm_alaw"
raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.")
raise ValueError(f"WAV encoding {encoding} is not supported.")
def _get_flac_sample_fmt(bps):
if bps is None or bps == 16:
return "s16"
if bps == 24:
return "s32"
raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
def _parse_save_args(
ext: Optional[str],
format: Optional[str],
encoding: Optional[str],
bps: Optional[int],
):
# torchaudio's save function accepts the followings, which do not 1to1 map
# to FFmpeg.
#
# - format: audio format
# - bits_per_sample: encoder sample format
# - encoding: such as PCM_U8.
#
# In FFmpeg, format is specified with the following three (and more)
#
# - muxer: could be audio format or container format.
# the one we passed to the constructor of StreamWriter
# - encoder: the audio encoder used to encode audio
# - encoder sample format: the format used by encoder to encode audio.
#
# If encoder sample format is different from source sample format, StreamWriter
# will insert a filter automatically.
#
def _type(spec):
# either format is exactly the specified one
# or extension matches to the spec AND there is no format override.
return format == spec or (format is None and ext == spec)
if _type("wav") or _type("amb"):
# wav is special because it supports different encoding through encoders
# each encoder only supports one encoder format
#
# amb format is a special case originated from libsox.
# It is basically a WAV format, with slight modification.
# https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
# It is a format so that decoders will recognize it as ambisonic.
# https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
# FFmpeg does not recognize amb because it is basically a WAV format.
muxer = "wav"
encoder = _get_encoder_for_wav(encoding, bps)
sample_fmt = None
elif _type("vorbis"):
# FFpmeg does not recognize vorbis extension, while libsox used to do.
# For the sake of bakward compatibility, (and the simplicity),
# we support the case where users want to do save("foo.vorbis")
muxer = "ogg"
encoder = "vorbis"
sample_fmt = None
else:
muxer = format
encoder = None
sample_fmt = None
if _type("flac"):
sample_fmt = _get_flac_sample_fmt(bps)
if _type("ogg"):
sample_fmt = _get_flac_sample_fmt(bps)
return muxer, encoder, sample_fmt
def save_audio(
uri: Union[BinaryIO, str, os.PathLike],
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
compression: Optional[torchaudio.io.CodecConfig] = None,
) -> None:
ext = None
if hasattr(uri, "write"):
if format is None:
raise RuntimeError("'format' is required when saving to file object.")
else:
uri = os.path.normpath(uri)
if tokens := str(uri).split(".")[1:]:
ext = tokens[-1].lower()
muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)
if channels_first:
src = src.T
s = StreamWriter(uri, format=muxer, buffer_size=buffer_size)
s.add_audio_stream(
sample_rate,
num_channels=src.size(-1),
format=_get_sample_format(src.dtype),
encoder=encoder,
encoder_format=enc_fmt,
codec_config=compression,
)
with s.open():
s.write_audio_chunk(0, src)
def _map_encoding(encoding: str) -> str:
for dst in ["PCM_S", "PCM_U", "PCM_F"]:
if dst in encoding:
return dst
if encoding == "PCM_MULAW":
return "ULAW"
elif encoding == "PCM_ALAW":
return "ALAW"
return encoding
def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
if m := re.search(r"PCM_\w(\d+)\w*", encoding):
return int(m.group(1))
elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
return 8
return bits_per_sample
class FFmpegBackend(Backend):
@staticmethod
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
if hasattr(uri, "read"):
metadata = info_audio_fileobj(uri, format, buffer_size=buffer_size)
else:
metadata = info_audio(os.path.normpath(uri), format)
metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
metadata.encoding = _map_encoding(metadata.encoding)
return metadata
@staticmethod
def load(
uri: Union[BinaryIO, str, os.PathLike],
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
) -> Tuple[torch.Tensor, int]:
if hasattr(uri, "read"):
return load_audio_fileobj(
uri,
frame_offset,
num_frames,
normalize,
channels_first,
format,
buffer_size,
)
else:
return load_audio(os.path.normpath(uri), frame_offset, num_frames, normalize, channels_first, format)
@staticmethod
def save(
uri: Union[BinaryIO, str, os.PathLike],
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
) -> None:
if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
raise ValueError(
"FFmpeg backend expects non-`None` value for argument `compression` to be of ",
f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}",
)
save_audio(
uri,
src,
sample_rate,
channels_first,
format,
encoding,
bits_per_sample,
buffer_size,
compression,
)
@staticmethod
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
return True
@staticmethod
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
return True
import os
from typing import BinaryIO, Optional, Tuple, Union
import torch
from torchaudio.io import CodecConfig
from . import soundfile_backend
from .backend import Backend
from .common import AudioMetaData
class SoundfileBackend(Backend):
@staticmethod
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
return soundfile_backend.info(uri, format)
@staticmethod
def load(
uri: Union[BinaryIO, str, os.PathLike],
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
) -> Tuple[torch.Tensor, int]:
return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
@staticmethod
def save(
uri: Union[BinaryIO, str, os.PathLike],
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
compression: Optional[Union[CodecConfig, float, int]] = None,
) -> None:
if compression:
raise ValueError("soundfile backend does not support argument `compression`.")
soundfile_backend.save(
uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
)
@staticmethod
def can_decode(uri, format) -> bool:
return True
@staticmethod
def can_encode(uri, format) -> bool:
return True
"""The new soundfile backend which will become default in 0.8.0 onward"""
import warnings
from typing import Optional, Tuple
import torch
from torchaudio._internal import module_utils as _mod_utils
from .common import AudioMetaData
_IS_SOUNDFILE_AVAILABLE = False
# TODO: import soundfile only when it is used.
if _mod_utils.is_module_available("soundfile"):
try:
import soundfile
_requires_soundfile = _mod_utils.no_op
_IS_SOUNDFILE_AVAILABLE = True
except Exception:
_requires_soundfile = _mod_utils.fail_with_message(
"requires soundfile, but we failed to import it. Please check the installation of soundfile."
)
else:
_requires_soundfile = _mod_utils.fail_with_message(
"requires soundfile, but it is not installed. Please install soundfile."
)
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
"PCM_S8": 8, # Signed 8 bit data
"PCM_16": 16, # Signed 16 bit data
"PCM_24": 24, # Signed 24 bit data
"PCM_32": 32, # Signed 32 bit data
"PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
"FLOAT": 32, # 32 bit float data
"DOUBLE": 64, # 64 bit float data
"ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"IMA_ADPCM": 0, # IMA ADPCM.
"MS_ADPCM": 0, # Microsoft ADPCM.
"GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
"VOX_ADPCM": 0, # OKI / Dialogix ADPCM
"G721_32": 0, # 32kbs G721 ADPCM encoding.
"G723_24": 0, # 24kbs G723 ADPCM encoding.
"G723_40": 0, # 40kbs G723 ADPCM encoding.
"DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
"DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
"DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
"DWVW_N": 0, # N bit Delta Width Variable Word encoding.
"DPCM_8": 8, # 8 bit differential PCM (XI only)
"DPCM_16": 16, # 16 bit differential PCM (XI only)
"VORBIS": 0, # Xiph Vorbis encoding. (lossy)
"ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
"ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
"ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
"ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
}
def _get_bit_depth(subtype):
if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
warnings.warn(
f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
"attribute will be set to 0. If you are seeing this warning, please "
"report by opening an issue on github (after checking for existing/closed ones). "
"You may otherwise ignore this warning."
)
return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
_SUBTYPE_TO_ENCODING = {
"PCM_S8": "PCM_S",
"PCM_16": "PCM_S",
"PCM_24": "PCM_S",
"PCM_32": "PCM_S",
"PCM_U8": "PCM_U",
"FLOAT": "PCM_F",
"DOUBLE": "PCM_F",
"ULAW": "ULAW",
"ALAW": "ALAW",
"VORBIS": "VORBIS",
}
def _get_encoding(format: str, subtype: str):
if format == "FLAC":
return "FLAC"
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
@_requires_soundfile
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
"""Get signal information of an audio file.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (path-like object or file-like object):
Source of audio data.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
AudioMetaData: meta data of the given audio.
"""
sinfo = soundfile.info(filepath)
return AudioMetaData(
sinfo.samplerate,
sinfo.frames,
sinfo.channels,
bits_per_sample=_get_bit_depth(sinfo.subtype),
encoding=_get_encoding(sinfo.format, sinfo.subtype),
)
_SUBTYPE2DTYPE = {
"PCM_S8": "int8",
"PCM_U8": "uint8",
"PCM_16": "int16",
"PCM_32": "int32",
"FLOAT": "float32",
"DOUBLE": "float64",
}
@_requires_soundfile
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
"""Load audio data from file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype, and the shape of `[channel, time]`.
.. warning::
``normalize`` argument does not perform volume normalization.
It only converts the sample type to `torch.float32` from the native sample
type.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
this function can return integer Tensor, where the samples are expressed within the whole range
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (path-like object or file-like object):
Source of audio data.
frame_offset (int, optional):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function converts the native sample type to ``float32``.
Default: ``True``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
with soundfile.SoundFile(filepath, "r") as file_:
if file_.format != "WAV" or normalize:
dtype = "float32"
elif file_.subtype not in _SUBTYPE2DTYPE:
raise ValueError(f"Unsupported subtype: {file_.subtype}")
else:
dtype = _SUBTYPE2DTYPE[file_.subtype]
frames = file_._prepare_read(frame_offset, None, num_frames)
waveform = file_.read(frames, dtype, always_2d=True)
sample_rate = file_.samplerate
waveform = torch.from_numpy(waveform)
if channels_first:
waveform = waveform.t()
return waveform, sample_rate
def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
if not encoding:
if not bits_per_sample:
subtype = {
torch.uint8: "PCM_U8",
torch.int16: "PCM_16",
torch.int32: "PCM_32",
torch.float32: "FLOAT",
torch.float64: "DOUBLE",
}.get(dtype)
if not subtype:
raise ValueError(f"Unsupported dtype for wav: {dtype}")
return subtype
if bits_per_sample == 8:
return "PCM_U8"
return f"PCM_{bits_per_sample}"
if encoding == "PCM_S":
if not bits_per_sample:
return "PCM_32"
if bits_per_sample == 8:
raise ValueError("wav does not support 8-bit signed PCM encoding.")
return f"PCM_{bits_per_sample}"
if encoding == "PCM_U":
if bits_per_sample in (None, 8):
return "PCM_U8"
raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
if encoding == "PCM_F":
if bits_per_sample in (None, 32):
return "FLOAT"
if bits_per_sample == 64:
return "DOUBLE"
raise ValueError("wav only supports 32/64-bit float PCM encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("wav only supports 8-bit mu-law encoding.")
if encoding == "ALAW":
if bits_per_sample in (None, 8):
return "ALAW"
raise ValueError("wav only supports 8-bit a-law encoding.")
raise ValueError(f"wav does not support {encoding}.")
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
if encoding in (None, "PCM_S"):
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
if encoding in ("PCM_U", "PCM_F"):
raise ValueError(f"sph does not support {encoding} encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("sph only supports 8-bit for mu-law encoding.")
if encoding == "ALAW":
return "ALAW"
raise ValueError(f"sph does not support {encoding}.")
def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
if format == "wav":
return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
if format == "flac":
if encoding:
raise ValueError("flac does not support encoding.")
if not bits_per_sample:
return "PCM_16"
if bits_per_sample > 24:
raise ValueError("flac does not support bits_per_sample > 24.")
return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
if format in ("ogg", "vorbis"):
if bits_per_sample:
raise ValueError("ogg/vorbis does not support bits_per_sample.")
if encoding is None or encoding == "vorbis":
return "VORBIS"
if encoding == "opus":
return "OPUS"
raise ValueError(f"Unexpected encoding: {encoding}")
if format == "mp3":
return "MPEG_LAYER_III"
if format == "sph":
return _get_subtype_for_sphere(encoding, bits_per_sample)
if format in ("nis", "nist"):
return "PCM_16"
raise ValueError(f"Unsupported format: {format}")
@_requires_soundfile
def save(
filepath: str,
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
"""Save audio data to file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (str or pathlib.Path): Path to audio file.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float of None, optional): Not used.
It is here only for interface compatibility reson with "sox_io" backend.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is
inferred from file extension. If the file extension is missing or
different, you can specify the correct format with this argument.
When ``filepath`` argument is file-like object,
this argument is required.
Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
``"flac"`` and ``"sph"``.
encoding (str or None, optional): Changes the encoding for supported formats.
This argument is effective only for supported formats, sush as
``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
bits_per_sample (int or None, optional): Changes the bit depth for the
supported formats.
When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
you can change the bit depth.
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
Supported formats/encodings/bit depth/compression are:
``"wav"``
- 32-bit floating-point PCM
- 32-bit signed integer PCM
- 24-bit signed integer PCM
- 16-bit signed integer PCM
- 8-bit unsigned integer PCM
- 8-bit mu-law
- 8-bit a-law
Note:
Default encoding/bit depth is determined by the dtype of
the input Tensor.
``"flac"``
- 8-bit
- 16-bit (default)
- 24-bit
``"ogg"``, ``"vorbis"``
- Doesn't accept changing configuration.
``"sph"``
- 8-bit signed integer PCM
- 16-bit signed integer PCM
- 24-bit signed integer PCM
- 32-bit signed integer PCM (default)
- 8-bit mu-law
- 8-bit a-law
- 16-bit a-law
- 24-bit a-law
- 32-bit a-law
"""
if src.ndim != 2:
raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
if compression is not None:
warnings.warn(
'`save` function of "soundfile" backend does not support "compression" parameter. '
"The argument is silently ignored."
)
if hasattr(filepath, "write"):
if format is None:
raise RuntimeError("`format` is required when saving to file object.")
ext = format.lower()
else:
ext = str(filepath).split(".")[-1].lower()
if bits_per_sample not in (None, 8, 16, 24, 32, 64):
raise ValueError("Invalid bits_per_sample.")
if bits_per_sample == 24:
warnings.warn(
"Saving audio with 24 bits per sample might warp samples near -1. "
"Using 16 bits per sample might be able to avoid this."
)
subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
# sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
# so we extend the extensions manually here
if ext in ["nis", "nist", "sph"] and format is None:
format = "NIST"
if channels_first:
src = src.t()
soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
import os
from typing import BinaryIO, Optional, Tuple, Union
import torch
from torchaudio.io import CodecConfig
from .backend import Backend
from .common import AudioMetaData
class SoXBackend(Backend):
@staticmethod
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
if hasattr(uri, "read"):
raise ValueError(
"SoX backend does not support reading from file-like objects. ",
"Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
)
else:
sinfo = torch.ops.torchaudio.sox_io_get_info(uri, format)
if sinfo:
return AudioMetaData(*sinfo)
else:
raise RuntimeError(f"Failed to fetch metadata for {uri}.")
@staticmethod
def load(
uri: Union[BinaryIO, str, os.PathLike],
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
) -> Tuple[torch.Tensor, int]:
if hasattr(uri, "read"):
raise ValueError(
"SoX backend does not support loading from file-like objects. ",
"Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
)
else:
ret = torch.ops.torchaudio.sox_io_load_audio_file(
uri, frame_offset, num_frames, normalize, channels_first, format
)
if not ret:
raise RuntimeError(f"Failed to load audio from {uri}.")
return ret
@staticmethod
def save(
uri: Union[BinaryIO, str, os.PathLike],
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
compression: Optional[Union[CodecConfig, float, int]] = None,
) -> None:
if not isinstance(compression, (float, int, type(None))):
raise ValueError(
"SoX backend expects non-`None` value for argument `compression` to be of ",
f"type `float` or `int`, but received value of type {type(compression)}",
)
if hasattr(uri, "write"):
raise ValueError(
"SoX backend does not support writing to file-like objects. ",
"Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
)
else:
torch.ops.torchaudio.sox_io_save_audio_file(
uri,
src,
sample_rate,
channels_first,
compression,
format,
encoding,
bits_per_sample,
)
@staticmethod
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
# i.e. not a file-like object.
return not hasattr(uri, "read")
@staticmethod
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
# i.e. not a file-like object.
return not hasattr(uri, "write")
import os
from functools import lru_cache
from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
import torch
from torchaudio._extension import _FFMPEG_EXT, _SOX_INITIALIZED
from torchaudio.io import CodecConfig
from . import soundfile_backend
from .backend import Backend
from .common import AudioMetaData
from .ffmpeg import FFmpegBackend
from .soundfile import SoundfileBackend
from .sox import SoXBackend
@lru_cache(None)
def get_available_backends() -> Dict[str, Type[Backend]]:
backend_specs: Dict[str, Type[Backend]] = {}
if _FFMPEG_EXT is not None:
backend_specs["ffmpeg"] = FFmpegBackend
if _SOX_INITIALIZED:
backend_specs["sox"] = SoXBackend
if soundfile_backend._IS_SOUNDFILE_AVAILABLE:
backend_specs["soundfile"] = SoundfileBackend
return backend_specs
def get_backend(backend_name, backends) -> Backend:
if backend := backends.get(backend_name):
return backend
else:
raise ValueError(
f"Unsupported backend '{backend_name}' specified; ",
f"please select one of {list(backends.keys())} instead.",
)
def get_info_func():
backends = get_available_backends()
def dispatcher(
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
) -> Backend:
if backend_name is not None:
return get_backend(backend_name, backends)
for backend in backends.values():
if backend.can_decode(uri, format):
return backend
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
def info(
uri: Union[BinaryIO, str, os.PathLike],
format: Optional[str] = None,
buffer_size: int = 4096,
backend: Optional[str] = None,
) -> AudioMetaData:
"""Get signal information of an audio file.
Note:
When the input type is file-like object, this function cannot
get the correct length (``num_samples``) for certain formats,
such as ``vorbis``.
In this case, the value of ``num_samples`` is ``0``.
Args:
uri (path-like object or file-like object):
Source of audio data. The following types are accepted:
* ``path-like``: File path or URL.
* ``file-like``: Object with ``read(size: int) -> bytes`` method,
which returns byte string of at most ``size`` length.
format (str or None, optional):
If not ``None``, interpreted as hint that may allow backend to override the detected format.
(Default: ``None``)
buffer_size (int, optional):
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
backend (str or None, optional):
I/O backend to use.
If ``None``, function selects backend given input and available backends.
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
with the corresponding backend available.
(Default: ``None``)
.. seealso::
:ref:`backend`
Returns:
AudioMetaData
"""
backend = dispatcher(uri, format, backend)
return backend.info(uri, format, buffer_size)
return info
def get_load_func():
backends = get_available_backends()
def dispatcher(
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
) -> Backend:
if backend_name is not None:
return get_backend(backend_name, backends)
for backend in backends.values():
if backend.can_decode(uri, format):
return backend
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
def load(
uri: Union[BinaryIO, str, os.PathLike],
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
buffer_size: int = 4096,
backend: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
"""Load audio data from source.
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype, and the shape of `[channel, time]`.
Note:
The formats this function can handle depend on the availability of backends.
Please use the following functions to fetch the supported formats.
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
- Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
.. warning::
``normalize`` argument does not perform volume normalization.
It only converts the sample type to `torch.float32` from the native sample
type.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
this function can return integer Tensor, where the samples are expressed within the whole range
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values.
Args:
uri (path-like object or file-like object):
Source of audio data.
frame_offset (int, optional):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function converts the native sample type to ``float32``.
Default: ``True``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
If not ``None``, interpreted as hint that may allow backend to override the detected format.
(Default: ``None``)
buffer_size (int, optional):
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
backend (str or None, optional):
I/O backend to use.
If ``None``, function selects backend given input and available backends.
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
with the corresponding backend being available. (Default: ``None``)
.. seealso::
:ref:`backend`
Returns:
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
backend = dispatcher(uri, format, backend)
return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
return load
def get_save_func():
backends = get_available_backends()
def dispatcher(
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
) -> Backend:
if backend_name is not None:
return get_backend(backend_name, backends)
for backend in backends.values():
if backend.can_encode(uri, format):
return backend
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
def save(
uri: Union[BinaryIO, str, os.PathLike],
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
buffer_size: int = 4096,
backend: Optional[str] = None,
compression: Optional[Union[CodecConfig, float, int]] = None,
):
"""Save audio data to file.
Note:
The formats this function can handle depend on the availability of backends.
Please use the following functions to fetch the supported formats.
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
- Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
Args:
uri (str or pathlib.Path): Path to audio file.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
format (str or None, optional): Override the audio format.
When ``uri`` argument is path-like object, audio format is
inferred from file extension. If the file extension is missing or
different, you can specify the correct format with this argument.
When ``uri`` argument is file-like object,
this argument is required.
Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``.
encoding (str or None, optional): Changes the encoding for supported formats.
This argument is effective only for supported formats, i.e.
``"wav"`` and ``""flac"```. Valid values are
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
bits_per_sample (int or None, optional): Changes the bit depth for the
supported formats.
When ``format`` is one of ``"wav"`` and ``"flac"``,
you can change the bit depth.
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
buffer_size (int, optional):
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
backend (str or None, optional):
I/O backend to use.
If ``None``, function selects backend given input and available backends.
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
with the corresponding backend being available.
(Default: ``None``)
.. seealso::
:ref:`backend`
compression (CodecConfig, float, int, or None, optional):
Compression configuration to apply.
If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
``sox`` command line interface must be provided. For instance:
``"mp3"``
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
``"flac"``
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
``"ogg"``, ``"vorbis"``
Number from ``-1`` to ``10``; ``-1`` is the highest compression
and lowest quality. Default: ``3``.
Refer to http://sox.sourceforge.net/soxformat.html for more details.
"""
backend = dispatcher(uri, format, backend)
return backend.save(
uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
)
return save
import logging
import os
import sys
from torchaudio._internal.module_utils import eval_env, fail_with_message, is_module_available, no_op
try:
from .fb import _init_ffmpeg
except ImportError:
from .utils import _init_ffmpeg
from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _fail_since_no_sox, _init_dll_path, _init_sox, _load_lib
_LG = logging.getLogger(__name__)
# Note:
# `_check_cuda_version` is not meant to be used by regular users.
# Builder uses it for debugging purpose, so we export it.
# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
__all__ = [
"fail_if_no_sox",
"fail_if_no_ffmpeg",
"_check_cuda_version",
"_IS_TORCHAUDIO_EXT_AVAILABLE",
"_IS_RIR_AVAILABLE",
"_SOX_INITIALIZED",
"_FFMPEG_EXT",
]
if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
_init_dll_path()
# When the extension module is built, we initialize it.
# In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and
# RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_ALIGN_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE:
_load_lib("libtorchaudio")
import torchaudio.lib._torchaudio # noqa
_check_cuda_version()
_IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
_IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
# Initialize libsox-related features
_SOX_INITIALIZED = False
_USE_SOX = False if os.name == "nt" else eval_env("TORCHAUDIO_USE_SOX", True)
_SOX_MODULE_AVAILABLE = is_module_available("torchaudio.lib._torchaudio_sox")
if _USE_SOX and _SOX_MODULE_AVAILABLE:
try:
_init_sox()
_SOX_INITIALIZED = True
except Exception:
# The initialization of sox extension will fail if supported sox
# libraries are not found in the system.
# Since the rest of the torchaudio works without it, we do not report the
# error here.
# The error will be raised when user code attempts to use these features.
_LG.debug("Failed to initialize sox extension", exc_info=True)
if os.name == "nt":
fail_if_no_sox = fail_with_message("requires sox extension, which is not supported on Windows.")
elif not _USE_SOX:
fail_if_no_sox = fail_with_message("requires sox extension, but it is disabled. (TORCHAUDIO_USE_SOX=0)")
elif not _SOX_MODULE_AVAILABLE:
fail_if_no_sox = fail_with_message(
"requires sox extension, but TorchAudio is not compiled with it. "
"Please build TorchAudio with libsox support. (BUILD_SOX=1)"
)
else:
fail_if_no_sox = no_op if _SOX_INITIALIZED else _fail_since_no_sox
# Initialize FFmpeg-related features
_FFMPEG_EXT = None
_USE_FFMPEG = eval_env("TORCHAUDIO_USE_FFMPEG", True)
if _USE_FFMPEG and _IS_TORCHAUDIO_EXT_AVAILABLE:
try:
_FFMPEG_EXT = _init_ffmpeg()
except Exception:
# The initialization of FFmpeg extension will fail if supported FFmpeg
# libraries are not found in the system.
# Since the rest of the torchaudio works without it, we do not report the
# error here.
# The error will be raised when user code attempts to use these features.
_LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
if _USE_FFMPEG:
fail_if_no_ffmpeg = _fail_since_no_ffmpeg if _FFMPEG_EXT is None else no_op
else:
fail_if_no_ffmpeg = fail_with_message("requires ffmpeg extension, but it is disabled. (TORCHAUDIO_USE_FFMPEG=0)")
fail_if_no_rir = (
no_op
if _IS_RIR_AVAILABLE
else fail_with_message(
"requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
)
)
fail_if_no_align = (
no_op
if _IS_ALIGN_AVAILABLE
else fail_with_message(
"Requires alignment extension, but TorchAudio is not compiled with it. \
Please build TorchAudio with alignment support."
)
)
"""Module to implement logics used for initializing extensions.
The implementations here should be stateless.
They should not depend on external state.
Anything that depends on external state should happen in __init__.py
"""
import importlib
import logging
import os
import platform
import warnings
from functools import wraps
from pathlib import Path
import torch
import torchaudio
_LG = logging.getLogger(__name__)
_LIB_DIR = Path(__file__).parent.parent / "lib"
def _get_lib_path(lib: str):
suffix = "pyd" if os.name == "nt" else "so"
path = _LIB_DIR / f"{lib}.{suffix}"
return path
def _load_lib(lib: str) -> bool:
"""Load extension module
Note:
In case `torchaudio` is deployed with `pex` format, the library file
is not in a standard location.
In this case, we expect that `libtorchaudio` is available somewhere
in the search path of dynamic loading mechanism, so that importing
`_torchaudio` will have library loader find and load `libtorchaudio`.
This is the reason why the function should not raising an error when the library
file is not found.
Returns:
bool:
True if the library file is found AND the library loaded without failure.
False if the library file is not found (like in the case where torchaudio
is deployed with pex format, thus the shared library file is
in a non-standard location.).
If the library file is found but there is an issue loading the library,
(such as missing dependency) then this function raises the exception as-is.
Raises:
Exception:
If the library file is found, but there is an issue loading the library file,
(when underlying `ctype.DLL` throws an exception), this function will pass
the exception as-is, instead of catching it and returning bool.
The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
is not found.
This behavior was chosen because the expected failure case is not recoverable.
If a dependency is missing, then users have to install it.
"""
path = _get_lib_path(lib)
if not path.exists():
return False
torch.ops.load_library(path)
torch.classes.load_library(path)
return True
def _init_sox():
_load_lib("libtorchaudio_sox")
import torchaudio.lib._torchaudio_sox # noqa
torchaudio.lib._torchaudio_sox.set_verbosity(0)
import atexit
torch.ops.torchaudio.sox_effects_initialize_sox_effects()
atexit.register(torch.ops.torchaudio.sox_effects_shutdown_sox_effects)
def _try_access_avutil(ffmpeg_ver):
libname_template = {
"Linux": "libavutil.so.{ver}",
"Darwin": "libavutil.{ver}.dylib",
"Windows": "avutil-{ver}.dll",
}[platform.system()]
avutil_ver = {"6": 58, "5": 57, "4": 56}[ffmpeg_ver]
libavutil = libname_template.format(ver=avutil_ver)
torchaudio.lib._torchaudio.find_avutil(libavutil)
def _find_versionsed_ffmpeg_extension(ffmpeg_ver: str):
_LG.debug("Attempting to load FFmpeg version %s.", ffmpeg_ver)
library = f"libtorchaudio_ffmpeg{ffmpeg_ver}"
extension = f"_torchaudio_ffmpeg{ffmpeg_ver}"
if not _get_lib_path(extension).exists():
raise RuntimeError(f"FFmpeg {ffmpeg_ver} extension is not available.")
if ffmpeg_ver:
# A simple check for FFmpeg availability.
# This is not technically sufficient as other libraries could be missing,
# but usually this is sufficient.
#
# Note: the reason why this check is performed is because I don't know
# if the next `_load_lib` (which calls `ctypes.CDLL` under the hood),
# could leak handle to shared libraries of dependencies, in case it fails.
#
# i.e. If the `ctypes.CDLL("foo")` fails because one of `foo`'s dependency
# does not exist while `foo` and some other dependencies exist, is it guaranteed
# that none-of them are kept in memory after the failure??
_try_access_avutil(ffmpeg_ver)
_load_lib(library)
_LG.debug("Found FFmpeg version %s.", ffmpeg_ver)
return importlib.import_module(f"torchaudio.lib.{extension}")
_FFMPEG_VERS = ["6", "5", "4", ""]
def _find_ffmpeg_extension(ffmpeg_vers, show_error):
logger = _LG.error if show_error else _LG.debug
for ffmpeg_ver in ffmpeg_vers:
try:
return _find_versionsed_ffmpeg_extension(ffmpeg_ver)
except Exception:
logger("Failed to load FFmpeg %s extension.", ffmpeg_ver, exc_info=True)
continue
raise ImportError(f"Failed to intialize FFmpeg extension. Tried versions: {ffmpeg_vers}")
def _find_available_ffmpeg_ext():
ffmpeg_vers = ["6", "5", "4", ""]
return [v for v in ffmpeg_vers if _get_lib_path(f"_torchaudio_ffmpeg{v}").exists()]
def _init_ffmpeg(show_error=False):
ffmpeg_vers = _find_available_ffmpeg_ext()
if not ffmpeg_vers:
raise RuntimeError(
# fmt: off
"TorchAudio is not built with FFmpeg integration. "
"Please build torchaudio with USE_FFMPEG=1."
# fmt: on
)
# User override
if ffmpeg_ver := os.environ.get("TORCHAUDIO_USE_FFMPEG_VERSION"):
if ffmpeg_vers == [""]:
warnings.warn("TorchAudio is built in single FFmpeg mode. TORCHAUDIO_USE_FFMPEG_VERSION is ignored.")
else:
if ffmpeg_ver not in ffmpeg_vers:
raise ValueError(
f"The FFmpeg version {ffmpeg_ver} (read from TORCHAUDIO_USE_FFMPEG_VERSION) "
f"is not available. Available versions are {[v for v in ffmpeg_vers if v]}"
)
ffmpeg_vers = [ffmpeg_ver]
ext = _find_ffmpeg_extension(ffmpeg_vers, show_error)
ext.init()
if ext.get_log_level() > 8:
ext.set_log_level(8)
return ext
def _init_dll_path():
# On Windows Python-3.8+ has `os.add_dll_directory` call,
# which is called to configure dll search path.
# To find cuda related dlls we need to make sure the
# conda environment/bin path is configured Please take a look:
# https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python
# Please note: if some path can't be added using add_dll_directory we simply ignore this path
for path in os.environ.get("PATH", "").split(";"):
if os.path.exists(path):
try:
os.add_dll_directory(path)
except Exception:
pass
def _check_cuda_version():
version = torchaudio.lib._torchaudio.cuda_version()
if version is not None and torch.version.cuda is not None:
version_str = str(version)
ta_version = f"{version_str[:-3]}.{version_str[-2]}"
t_version = torch.version.cuda.split(".")
t_version = f"{t_version[0]}.{t_version[1]}"
if ta_version != t_version:
raise RuntimeError(
"Detected that PyTorch and TorchAudio were compiled with different CUDA versions. "
f"PyTorch has CUDA version {t_version} whereas TorchAudio has CUDA version {ta_version}. "
"Please install the TorchAudio version that matches your PyTorch version."
)
return version
def _fail_since_no_sox(func):
@wraps(func)
def wrapped(*_args, **_kwargs):
try:
# Note:
# We run _init_sox again just to show users the stacktrace.
# _init_sox would not succeed here.
_init_sox()
except Exception as err:
raise RuntimeError(
f"{func.__name__} requires sox extension which is not available. "
"Please refer to the stacktrace above for how to resolve this."
) from err
# This should not happen in normal execution, but just in case.
return func(*_args, **_kwargs)
return wrapped
def _fail_since_no_ffmpeg(func):
@wraps(func)
def wrapped(*_args, **_kwargs):
try:
# Note:
# We run _init_ffmpeg again just to show users the stacktrace.
# _init_ffmpeg would not succeed here.
_init_ffmpeg(show_error=True)
except Exception as err:
raise RuntimeError(
f"{func.__name__} requires FFmpeg extension which is not available. "
"Please refer to the stacktrace above for how to resolve this."
) from err
# This should not happen in normal execution, but just in case.
return func(*_args, **_kwargs)
return wrapped
from torch.hub import download_url_to_file, load_state_dict_from_url
try:
from .fb import download_url_to_file, load_state_dict_from_url
except ImportError:
from torch.hub import download_url_to_file, load_state_dict_from_url
__all__ = [
......
import importlib.util
import os
import warnings
from functools import wraps
from typing import Optional
import torch
def eval_env(var, default):
"""Check if environment varable has True-y value"""
if var not in os.environ:
return default
val = os.environ.get(var, "0")
trues = ["1", "true", "TRUE", "on", "ON", "yes", "YES"]
falses = ["0", "false", "FALSE", "off", "OFF", "no", "NO"]
if val in trues:
return True
if val not in falses:
# fmt: off
raise RuntimeError(
f"Unexpected environment variable value `{var}={val}`. "
f"Expected one of {trues + falses}")
# fmt: on
return False
def is_module_available(*modules: str) -> bool:
......@@ -42,106 +60,54 @@ def requires_module(*modules: str):
return decorator
def deprecated(direction: str, version: Optional[str] = None):
def deprecated(direction: str, version: Optional[str] = None, remove: bool = False):
"""Decorator to add deprecation message
Args:
direction (str): Migration steps to be given to users.
version (str or int): The version when the object will be removed
remove (bool): If enabled, append future removal message.
"""
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
message = (
f"{func.__module__}.{func.__name__} has been deprecated "
f'and will be removed from {"future" if version is None else version} release. '
f"{direction}"
)
message = f"{func.__module__}.{func.__name__} has been deprecated. {direction}"
if remove:
message += f' It will be removed from {"future" if version is None else version} release. '
warnings.warn(message, stacklevel=2)
return func(*args, **kwargs)
return wrapped
return decorator
message = "This function has been deprecated. "
if remove:
message += f'It will be removed from {"future" if version is None else version} release. '
def is_kaldi_available():
return is_module_available("torchaudio._torchaudio") and torch.ops.torchaudio.is_kaldi_available()
wrapped.__doc__ = f"""DEPRECATED: {func.__doc__}
.. warning::
def requires_kaldi():
if is_kaldi_available():
{message}
{direction}
"""
def decorator(func):
return func
else:
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(f"{func.__module__}.{func.__name__} requires kaldi")
return wrapped
return wrapped
return decorator
def _check_soundfile_importable():
if not is_module_available("soundfile"):
return False
try:
import soundfile # noqa: F401
return True
except Exception:
warnings.warn("Failed to import soundfile. 'soundfile' backend is not available.")
return False
_is_soundfile_importable = _check_soundfile_importable()
def is_soundfile_available():
return _is_soundfile_importable
def requires_soundfile():
if is_soundfile_available():
def decorator(func):
return func
else:
def fail_with_message(message):
"""Generate decorator to give users message about missing TorchAudio extension."""
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(f"{func.__module__}.{func.__name__} requires soundfile")
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(f"{func.__module__}.{func.__name__} {message}")
return wrapped
return wrapped
return decorator
def is_sox_available():
return is_module_available("torchaudio._torchaudio") and torch.ops.torchaudio.is_sox_available()
def requires_sox():
if is_sox_available():
def decorator(func):
return func
else:
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(f"{func.__module__}.{func.__name__} requires sox")
return wrapped
return decorator
def no_op(func):
"""Op-op decorator. Used in place of fail_with_message when a functionality that requires extension works fine."""
return func
# flake8: noqa
from . import utils
from .utils import get_audio_backend, list_audio_backends, set_audio_backend
# NOTE:
# The entire `torchaudio.backend` module is deprecated.
# New things should be added to `torchaudio._backend`.
# Only things related to backward compatibility should be placed here.
utils._init_audio_backend()
from . import common, no_backend, soundfile_backend, sox_io_backend # noqa
from .utils import _init_backend, get_audio_backend, list_audio_backends, set_audio_backend
__all__ = ["_init_backend", "get_audio_backend", "list_audio_backends", "set_audio_backend"]
from pathlib import Path
from typing import Callable, Optional, Tuple, Union
from torch import Tensor
def load(
filepath: Union[str, Path],
out: Optional[Tensor] = None,
normalization: Union[bool, float, Callable] = True,
channels_first: bool = True,
num_frames: int = 0,
offset: int = 0,
filetype: Optional[str] = None,
) -> Tuple[Tensor, int]:
raise RuntimeError("No audio I/O backend is available.")
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
raise RuntimeError("No audio I/O backend is available.")
def info(filepath: str) -> None:
raise RuntimeError("No audio I/O backend is available.")
import os
from typing import Optional, Tuple
import torch
import torchaudio
from torchaudio import AudioMetaData
@torchaudio._extension.fail_if_no_sox
def info(
filepath: str,
format: Optional[str] = None,
) -> AudioMetaData:
"""Get signal information of an audio file.
Args:
filepath (str):
Source of audio data.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension.
Returns:
AudioMetaData: Metadata of the given audio.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "read"):
raise RuntimeError("sox_io backend does not support file-like object.")
filepath = os.fspath(filepath)
sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
return AudioMetaData(*sinfo)
@torchaudio._extension.fail_if_no_sox
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
"""Load audio data from file.
Note:
This function can handle all the codecs that underlying libsox can handle,
however it is tested on the following formats;
* WAV, AMB
* 32-bit floating-point
* 32-bit signed integer
* 24-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer (WAV only)
* MP3
* FLAC
* OGG/VORBIS
* OPUS
* SPHERE
* AMR-NB
To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not
handle natively, your installation of ``torchaudio`` has to be linked to ``libsox``
and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype, and the shape of `[channel, time]`.
.. warning::
``normalize`` argument does not perform volume normalization.
It only converts the sample type to `torch.float32` from the native sample
type.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
this function can return integer Tensor, where the samples are expressed within the whole range
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values.
Args:
filepath (path-like object): Source of audio data.
frame_offset (int):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function converts the native sample type to ``float32``.
Default: ``True``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension.
Returns:
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and ``normalize=False``, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "read"):
raise RuntimeError("sox_io backend does not support file-like object.")
filepath = os.fspath(filepath)
return torch.ops.torchaudio.sox_io_load_audio_file(
filepath, frame_offset, num_frames, normalize, channels_first, format
)
@torchaudio._extension.fail_if_no_sox
def save(
filepath: str,
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
"""Save audio data to file.
Args:
filepath (path-like object): Path to save file.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float or None, optional): Used for formats other than WAV.
This corresponds to ``-C`` option of ``sox`` command.
``"mp3"``
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
``"flac"``
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
``"ogg"``, ``"vorbis"``
Number from ``-1`` to ``10``; ``-1`` is the highest compression
and lowest quality. Default: ``3``.
See the detail at http://sox.sourceforge.net/soxformat.html.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is infered from
file extension. If file extension is missing or different, you can specify the
correct format with this argument.
When ``filepath`` argument is file-like object, this argument is required.
Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``,
``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``.
encoding (str or None, optional): Changes the encoding for the supported formats.
This argument is effective only for supported formats, such as ``"wav"``, ``""amb"``
and ``"sph"``. Valid values are;
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
Default values
If not provided, the default value is picked based on ``format`` and ``bits_per_sample``.
``"wav"``, ``"amb"``
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
| Tensor is used to determine the default value.
- ``"PCM_U"`` if dtype is ``uint8``
- ``"PCM_S"`` if dtype is ``int16`` or ``int32``
- ``"PCM_F"`` if dtype is ``float32``
- ``"PCM_U"`` if ``bits_per_sample=8``
- ``"PCM_S"`` otherwise
``"sph"`` format;
- the default value is ``"PCM_S"``
bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the
bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``.
Default Value;
If not provided, the default values are picked based on ``format`` and ``"encoding"``;
``"wav"``, ``"amb"``;
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
| Tensor is used.
- ``8`` if dtype is ``uint8``
- ``16`` if dtype is ``int16``
- ``32`` if dtype is ``int32`` or ``float32``
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
- ``16`` if ``encoding`` is ``"PCM_S"``
- ``32`` if ``encoding`` is ``"PCM_F"``
``"flac"`` format;
- the default value is ``24``
``"sph"`` format;
- ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided.
- ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"``
``"amb"`` format;
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
- ``16`` if ``encoding`` is ``"PCM_S"`` or not provided.
- ``32`` if ``encoding`` is ``"PCM_F"``
Supported formats/encodings/bit depth/compression are;
``"wav"``, ``"amb"``
- 32-bit floating-point PCM
- 32-bit signed integer PCM
- 24-bit signed integer PCM
- 16-bit signed integer PCM
- 8-bit unsigned integer PCM
- 8-bit mu-law
- 8-bit a-law
Note: Default encoding/bit depth is determined by the dtype of the input Tensor.
``"mp3"``
Fixed bit rate (such as 128kHz) and variable bit rate compression.
Default: VBR with high quality.
``"flac"``
- 8-bit
- 16-bit
- 24-bit (default)
``"ogg"``, ``"vorbis"``
- Different quality level. Default: approx. 112kbps
``"sph"``
- 8-bit signed integer PCM
- 16-bit signed integer PCM
- 24-bit signed integer PCM
- 32-bit signed integer PCM (default)
- 8-bit mu-law
- 8-bit a-law
- 16-bit a-law
- 24-bit a-law
- 32-bit a-law
``"amr-nb"``
Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s
``"gsm"``
Lossy Speech Compression, CPU intensive.
``"htk"``
Uses a default single-channel 16-bit PCM format.
Note:
To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,
``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has
to be linked to ``libsox`` and corresponding codec libraries such as ``libmad``
or ``libmp3lame`` etc.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "write"):
raise RuntimeError("sox_io backend does not handle file-like object.")
filepath = os.fspath(filepath)
torch.ops.torchaudio.sox_io_save_audio_file(
filepath,
src,
sample_rate,
channels_first,
compression,
format,
encoding,
bits_per_sample,
)
class AudioMetaData:
"""Return type of ``torchaudio.info`` function.
This class is used by :py:mod:`"sox_io" backend<torchaudio.backends.sox_io_backend>` and
:py:mod:`"soundfile" backend<torchaudio.backends.soundfile_backend>`.
:ivar int sample_rate: Sample rate
:ivar int num_frames: The number of frames
:ivar int num_channels: The number of channels
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
or when it cannot be accurately inferred.
:ivar str encoding: Audio encoding
The values encoding can take are one of the following:
* ``PCM_S``: Signed integer linear PCM
* ``PCM_U``: Unsigned integer linear PCM
* ``PCM_F``: Floating point linear PCM
* ``FLAC``: Flac, Free Lossless Audio Codec
* ``ULAW``: Mu-law
* ``ALAW``: A-law
* ``MP3`` : MP3, MPEG-1 Audio Layer III
* ``VORBIS``: OGG Vorbis
* ``AMR_WB``: Adaptive Multi-Rate
* ``AMR_NB``: Adaptive Multi-Rate Wideband
* ``OPUS``: Opus
* ``HTK``: Single channel 16-bit PCM
* ``UNKNOWN`` : None of above
"""
def __init__(
self,
sample_rate: int,
num_frames: int,
num_channels: int,
bits_per_sample: int,
encoding: str,
):
self.sample_rate = sample_rate
self.num_frames = num_frames
self.num_channels = num_channels
self.bits_per_sample = bits_per_sample
self.encoding = encoding
def __str__(self):
return (
f"AudioMetaData("
f"sample_rate={self.sample_rate}, "
f"num_frames={self.num_frames}, "
f"num_channels={self.num_channels}, "
f"bits_per_sample={self.bits_per_sample}, "
f"encoding={self.encoding}"
f")"
def __getattr__(name: str):
import warnings
if name == "AudioMetaData":
warnings.warn(
"`torchaudio.backend.common.AudioMetaData` has been moved to "
"`torchaudio.AudioMetaData`. Please update the import path.",
stacklevel=2,
)
from torchaudio._backend.common import AudioMetaData
return AudioMetaData
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
from pathlib import Path
from typing import Callable, Optional, Tuple, Union
def __getattr__(name: str):
import warnings
from torch import Tensor
warnings.warn(
"Torchaudio's I/O functions now support par-call bakcend dispatch. "
"Importing backend implementation directly is no longer guaranteed to work. "
"Please use `backend` keyword with load/save/info function, instead of "
"calling the udnerlying implementation directly.",
stacklevel=2,
)
from . import _no_backend
def load(
filepath: Union[str, Path],
out: Optional[Tensor] = None,
normalization: Union[bool, float, Callable] = True,
channels_first: bool = True,
num_frames: int = 0,
offset: int = 0,
filetype: Optional[str] = None,
) -> Tuple[Tensor, int]:
raise RuntimeError("No audio I/O backend is available.")
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
raise RuntimeError("No audio I/O backend is available.")
def info(filepath: str) -> None:
raise RuntimeError("No audio I/O backend is available.")
return getattr(_no_backend, name)
"""The new soundfile backend which will become default in 0.8.0 onward"""
import warnings
from typing import Optional, Tuple
import torch
from torchaudio._internal import module_utils as _mod_utils
from .common import AudioMetaData
if _mod_utils.is_soundfile_available():
import soundfile
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
"PCM_S8": 8, # Signed 8 bit data
"PCM_16": 16, # Signed 16 bit data
"PCM_24": 24, # Signed 24 bit data
"PCM_32": 32, # Signed 32 bit data
"PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
"FLOAT": 32, # 32 bit float data
"DOUBLE": 64, # 64 bit float data
"ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"IMA_ADPCM": 0, # IMA ADPCM.
"MS_ADPCM": 0, # Microsoft ADPCM.
"GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
"VOX_ADPCM": 0, # OKI / Dialogix ADPCM
"G721_32": 0, # 32kbs G721 ADPCM encoding.
"G723_24": 0, # 24kbs G723 ADPCM encoding.
"G723_40": 0, # 40kbs G723 ADPCM encoding.
"DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
"DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
"DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
"DWVW_N": 0, # N bit Delta Width Variable Word encoding.
"DPCM_8": 8, # 8 bit differential PCM (XI only)
"DPCM_16": 16, # 16 bit differential PCM (XI only)
"VORBIS": 0, # Xiph Vorbis encoding. (lossy)
"ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
"ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
"ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
"ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
}
def _get_bit_depth(subtype):
if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
warnings.warn(
f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
"attribute will be set to 0. If you are seeing this warning, please "
"report by opening an issue on github (after checking for existing/closed ones). "
"You may otherwise ignore this warning."
)
return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
_SUBTYPE_TO_ENCODING = {
"PCM_S8": "PCM_S",
"PCM_16": "PCM_S",
"PCM_24": "PCM_S",
"PCM_32": "PCM_S",
"PCM_U8": "PCM_U",
"FLOAT": "PCM_F",
"DOUBLE": "PCM_F",
"ULAW": "ULAW",
"ALAW": "ALAW",
"VORBIS": "VORBIS",
}
def _get_encoding(format: str, subtype: str):
if format == "FLAC":
return "FLAC"
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
@_mod_utils.requires_soundfile()
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
"""Get signal information of an audio file.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (path-like object or file-like object):
Source of audio data.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
AudioMetaData: meta data of the given audio.
"""
sinfo = soundfile.info(filepath)
return AudioMetaData(
sinfo.samplerate,
sinfo.frames,
sinfo.channels,
bits_per_sample=_get_bit_depth(sinfo.subtype),
encoding=_get_encoding(sinfo.format, sinfo.subtype),
def __getattr__(name: str):
import warnings
warnings.warn(
"Torchaudio's I/O functions now support par-call bakcend dispatch. "
"Importing backend implementation directly is no longer guaranteed to work. "
"Please use `backend` keyword with load/save/info function, instead of "
"calling the udnerlying implementation directly.",
stacklevel=2,
)
from torchaudio._backend import soundfile_backend
_SUBTYPE2DTYPE = {
"PCM_S8": "int8",
"PCM_U8": "uint8",
"PCM_16": "int16",
"PCM_32": "int32",
"FLOAT": "float32",
"DOUBLE": "float64",
}
@_mod_utils.requires_soundfile()
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
"""Load audio data from file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype, and the shape of `[channel, time]`.
.. warning::
``normalize`` argument does not perform volume normalization.
It only converts the sample type to `torch.float32` from the native sample
type.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
this function can return integer Tensor, where the samples are expressed within the whole range
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (path-like object or file-like object):
Source of audio data.
frame_offset (int, optional):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function converts the native sample type to ``float32``.
Default: ``True``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
with soundfile.SoundFile(filepath, "r") as file_:
if file_.format != "WAV" or normalize:
dtype = "float32"
elif file_.subtype not in _SUBTYPE2DTYPE:
raise ValueError(f"Unsupported subtype: {file_.subtype}")
else:
dtype = _SUBTYPE2DTYPE[file_.subtype]
frames = file_._prepare_read(frame_offset, None, num_frames)
waveform = file_.read(frames, dtype, always_2d=True)
sample_rate = file_.samplerate
waveform = torch.from_numpy(waveform)
if channels_first:
waveform = waveform.t()
return waveform, sample_rate
def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
if not encoding:
if not bits_per_sample:
subtype = {
torch.uint8: "PCM_U8",
torch.int16: "PCM_16",
torch.int32: "PCM_32",
torch.float32: "FLOAT",
torch.float64: "DOUBLE",
}.get(dtype)
if not subtype:
raise ValueError(f"Unsupported dtype for wav: {dtype}")
return subtype
if bits_per_sample == 8:
return "PCM_U8"
return f"PCM_{bits_per_sample}"
if encoding == "PCM_S":
if not bits_per_sample:
return "PCM_32"
if bits_per_sample == 8:
raise ValueError("wav does not support 8-bit signed PCM encoding.")
return f"PCM_{bits_per_sample}"
if encoding == "PCM_U":
if bits_per_sample in (None, 8):
return "PCM_U8"
raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
if encoding == "PCM_F":
if bits_per_sample in (None, 32):
return "FLOAT"
if bits_per_sample == 64:
return "DOUBLE"
raise ValueError("wav only supports 32/64-bit float PCM encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("wav only supports 8-bit mu-law encoding.")
if encoding == "ALAW":
if bits_per_sample in (None, 8):
return "ALAW"
raise ValueError("wav only supports 8-bit a-law encoding.")
raise ValueError(f"wav does not support {encoding}.")
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
if encoding in (None, "PCM_S"):
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
if encoding in ("PCM_U", "PCM_F"):
raise ValueError(f"sph does not support {encoding} encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("sph only supports 8-bit for mu-law encoding.")
if encoding == "ALAW":
return "ALAW"
raise ValueError(f"sph does not support {encoding}.")
def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
if format == "wav":
return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
if format == "flac":
if encoding:
raise ValueError("flac does not support encoding.")
if not bits_per_sample:
return "PCM_16"
if bits_per_sample > 24:
raise ValueError("flac does not support bits_per_sample > 24.")
return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
if format in ("ogg", "vorbis"):
if encoding or bits_per_sample:
raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
return "VORBIS"
if format == "sph":
return _get_subtype_for_sphere(encoding, bits_per_sample)
if format in ("nis", "nist"):
return "PCM_16"
raise ValueError(f"Unsupported format: {format}")
@_mod_utils.requires_soundfile()
def save(
filepath: str,
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
"""Save audio data to file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
which has a restriction on type annotation due to TorchScript compiler compatiblity.
Args:
filepath (str or pathlib.Path): Path to audio file.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float of None, optional): Not used.
It is here only for interface compatibility reson with "sox_io" backend.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is
inferred from file extension. If the file extension is missing or
different, you can specify the correct format with this argument.
When ``filepath`` argument is file-like object,
this argument is required.
Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
``"flac"`` and ``"sph"``.
encoding (str or None, optional): Changes the encoding for supported formats.
This argument is effective only for supported formats, sush as
``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
bits_per_sample (int or None, optional): Changes the bit depth for the
supported formats.
When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
you can change the bit depth.
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
Supported formats/encodings/bit depth/compression are:
``"wav"``
- 32-bit floating-point PCM
- 32-bit signed integer PCM
- 24-bit signed integer PCM
- 16-bit signed integer PCM
- 8-bit unsigned integer PCM
- 8-bit mu-law
- 8-bit a-law
Note:
Default encoding/bit depth is determined by the dtype of
the input Tensor.
``"flac"``
- 8-bit
- 16-bit (default)
- 24-bit
``"ogg"``, ``"vorbis"``
- Doesn't accept changing configuration.
``"sph"``
- 8-bit signed integer PCM
- 16-bit signed integer PCM
- 24-bit signed integer PCM
- 32-bit signed integer PCM (default)
- 8-bit mu-law
- 8-bit a-law
- 16-bit a-law
- 24-bit a-law
- 32-bit a-law
"""
if src.ndim != 2:
raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
if compression is not None:
warnings.warn(
'`save` function of "soundfile" backend does not support "compression" parameter. '
"The argument is silently ignored."
)
if hasattr(filepath, "write"):
if format is None:
raise RuntimeError("`format` is required when saving to file object.")
ext = format.lower()
else:
ext = str(filepath).split(".")[-1].lower()
if bits_per_sample not in (None, 8, 16, 24, 32, 64):
raise ValueError("Invalid bits_per_sample.")
if bits_per_sample == 24:
warnings.warn(
"Saving audio with 24 bits per sample might warp samples near -1. "
"Using 16 bits per sample might be able to avoid this."
)
subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
# sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
# so we extend the extensions manually here
if ext in ["nis", "nist", "sph"] and format is None:
format = "NIST"
if channels_first:
src = src.t()
soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
return getattr(soundfile_backend, name)
import os
from typing import Optional, Tuple
import torch
import torchaudio
from torchaudio._internal import module_utils as _mod_utils
from torchaudio.utils.sox_utils import get_buffer_size
from .common import AudioMetaData
# Note: need to comply TorchScript syntax -- need annotation and no f-string
def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
# Note: need to comply TorchScript syntax -- need annotation and no f-string
def _fail_load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
raise RuntimeError("Failed to load audio from {}".format(filepath))
def _fail_load_fileobj(fileobj, *args, **kwargs):
raise RuntimeError(f"Failed to load audio from {fileobj}")
if torchaudio._extension._FFMPEG_INITIALIZED:
import torchaudio.io._compat as _compat
_fallback_info = _compat.info_audio
_fallback_info_fileobj = _compat.info_audio_fileobj
_fallback_load = _compat.load_audio
_fallback_load_fileobj = _compat.load_audio_fileobj
else:
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
_fallback_load_fileobj = _fail_load_fileobj
@_mod_utils.requires_sox()
def info(
filepath: str,
format: Optional[str] = None,
) -> AudioMetaData:
"""Get signal information of an audio file.
Args:
filepath (path-like object or file-like object):
Source of audio data. When the function is not compiled by TorchScript,
(e.g. ``torch.jit.script``), the following types are accepted;
* ``path-like``: file path
* ``file-like``: Object with ``read(size: int) -> bytes`` method,
which returns byte string of at most ``size`` length.
When the function is compiled by TorchScript, only ``str`` type is allowed.
Note:
* When the input type is file-like object, this function cannot
get the correct length (``num_samples``) for certain formats,
such as ``vorbis``.
In this case, the value of ``num_samples`` is ``0``.
* This argument is intentionally annotated as ``str`` only due to
TorchScript compiler compatibility.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension.
Returns:
AudioMetaData: Metadata of the given audio.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "read"):
# Special case for Backward compatibility
# v0.11 -> v0.12, mp3 handling is moved to FFmpeg.
# file-like objects are not necessarily fallback-able
# when they are not seekable.
# The previous libsox-based implementation required `format="mp3"`
# because internally libsox does not auto-detect the format.
# For the special BC for mp3, we handle mp3 differently.
buffer_size = get_buffer_size()
if format == "mp3":
return _fallback_info_fileobj(filepath, format, buffer_size)
sinfo = torchaudio._torchaudio.get_info_fileobj(filepath, format)
if sinfo is not None:
return AudioMetaData(*sinfo)
return _fallback_info_fileobj(filepath, format, buffer_size)
filepath = os.fspath(filepath)
sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
if sinfo is not None:
return AudioMetaData(*sinfo)
return _fallback_info(filepath, format)
@_mod_utils.requires_sox()
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
"""Load audio data from file.
Note:
This function can handle all the codecs that underlying libsox can handle,
however it is tested on the following formats;
* WAV, AMB
* 32-bit floating-point
* 32-bit signed integer
* 24-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer (WAV only)
* MP3
* FLAC
* OGG/VORBIS
* OPUS
* SPHERE
* AMR-NB
To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not
handle natively, your installation of ``torchaudio`` has to be linked to ``libsox``
and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype, and the shape of `[channel, time]`.
.. warning::
``normalize`` argument does not perform volume normalization.
It only converts the sample type to `torch.float32` from the native sample
type.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
this function can return integer Tensor, where the samples are expressed within the whole range
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values.
Args:
filepath (path-like object or file-like object):
Source of audio data. When the function is not compiled by TorchScript,
(e.g. ``torch.jit.script``), the following types are accepted;
* ``path-like``: file path
* ``file-like``: Object with ``read(size: int) -> bytes`` method,
which returns byte string of at most ``size`` length.
When the function is compiled by TorchScript, only ``str`` type is allowed.
Note: This argument is intentionally annotated as ``str`` only due to
TorchScript compiler compatibility.
frame_offset (int):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function converts the native sample type to ``float32``.
Default: ``True``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Override the format detection with the given format.
Providing the argument might help when libsox can not infer the format
from header or extension.
Returns:
(torch.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and ``normalize=False``, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "read"):
# Special case for Backward compatibility
# v0.11 -> v0.12, mp3 handling is moved to FFmpeg.
# file-like objects are not necessarily fallback-able
# when they are not seekable.
# The previous libsox-based implementation required `format="mp3"`
# because internally libsox does not auto-detect the format.
# For the special BC for mp3, we handle mp3 differently.
buffer_size = get_buffer_size()
if format == "mp3":
return _fallback_load_fileobj(
filepath,
frame_offset,
num_frames,
normalize,
channels_first,
format,
buffer_size,
)
ret = torchaudio._torchaudio.load_audio_fileobj(
filepath, frame_offset, num_frames, normalize, channels_first, format
)
if ret is not None:
return ret
return _fallback_load_fileobj(
filepath,
frame_offset,
num_frames,
normalize,
channels_first,
format,
buffer_size,
)
filepath = os.fspath(filepath)
ret = torch.ops.torchaudio.sox_io_load_audio_file(
filepath, frame_offset, num_frames, normalize, channels_first, format
def __getattr__(name: str):
import warnings
warnings.warn(
"Torchaudio's I/O functions now support par-call bakcend dispatch. "
"Importing backend implementation directly is no longer guaranteed to work. "
"Please use `backend` keyword with load/save/info function, instead of "
"calling the udnerlying implementation directly.",
stacklevel=2,
)
if ret is not None:
return ret
return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
@_mod_utils.requires_sox()
def save(
filepath: str,
src: torch.Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
"""Save audio data to file.
Args:
filepath (str or pathlib.Path): Path to save file.
This function also handles ``pathlib.Path`` objects, but is annotated
as ``str`` for TorchScript compiler compatibility.
src (torch.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float or None, optional): Used for formats other than WAV.
This corresponds to ``-C`` option of ``sox`` command.
``"mp3"``
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
``"flac"``
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
``"ogg"``, ``"vorbis"``
Number from ``-1`` to ``10``; ``-1`` is the highest compression
and lowest quality. Default: ``3``.
See the detail at http://sox.sourceforge.net/soxformat.html.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is infered from
file extension. If file extension is missing or different, you can specify the
correct format with this argument.
When ``filepath`` argument is file-like object, this argument is required.
Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``,
``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``.
encoding (str or None, optional): Changes the encoding for the supported formats.
This argument is effective only for supported formats, such as ``"wav"``, ``""amb"``
and ``"sph"``. Valid values are;
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
Default values
If not provided, the default value is picked based on ``format`` and ``bits_per_sample``.
``"wav"``, ``"amb"``
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
| Tensor is used to determine the default value.
- ``"PCM_U"`` if dtype is ``uint8``
- ``"PCM_S"`` if dtype is ``int16`` or ``int32``
- ``"PCM_F"`` if dtype is ``float32``
- ``"PCM_U"`` if ``bits_per_sample=8``
- ``"PCM_S"`` otherwise
``"sph"`` format;
- the default value is ``"PCM_S"``
from . import _sox_io_backend
bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the
bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``.
Default Value;
If not provided, the default values are picked based on ``format`` and ``"encoding"``;
``"wav"``, ``"amb"``;
- | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
| Tensor is used.
- ``8`` if dtype is ``uint8``
- ``16`` if dtype is ``int16``
- ``32`` if dtype is ``int32`` or ``float32``
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
- ``16`` if ``encoding`` is ``"PCM_S"``
- ``32`` if ``encoding`` is ``"PCM_F"``
``"flac"`` format;
- the default value is ``24``
``"sph"`` format;
- ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided.
- ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"``
``"amb"`` format;
- ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
- ``16`` if ``encoding`` is ``"PCM_S"`` or not provided.
- ``32`` if ``encoding`` is ``"PCM_F"``
Supported formats/encodings/bit depth/compression are;
``"wav"``, ``"amb"``
- 32-bit floating-point PCM
- 32-bit signed integer PCM
- 24-bit signed integer PCM
- 16-bit signed integer PCM
- 8-bit unsigned integer PCM
- 8-bit mu-law
- 8-bit a-law
Note: Default encoding/bit depth is determined by the dtype of the input Tensor.
``"mp3"``
Fixed bit rate (such as 128kHz) and variable bit rate compression.
Default: VBR with high quality.
``"flac"``
- 8-bit
- 16-bit
- 24-bit (default)
``"ogg"``, ``"vorbis"``
- Different quality level. Default: approx. 112kbps
``"sph"``
- 8-bit signed integer PCM
- 16-bit signed integer PCM
- 24-bit signed integer PCM
- 32-bit signed integer PCM (default)
- 8-bit mu-law
- 8-bit a-law
- 16-bit a-law
- 24-bit a-law
- 32-bit a-law
``"amr-nb"``
Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s
``"gsm"``
Lossy Speech Compression, CPU intensive.
``"htk"``
Uses a default single-channel 16-bit PCM format.
Note:
To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,
``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has
to be linked to ``libsox`` and corresponding codec libraries such as ``libmad``
or ``libmp3lame`` etc.
"""
if not torch.jit.is_scripting():
if hasattr(filepath, "write"):
torchaudio._torchaudio.save_audio_fileobj(
filepath,
src,
sample_rate,
channels_first,
compression,
format,
encoding,
bits_per_sample,
)
return
filepath = os.fspath(filepath)
torch.ops.torchaudio.sox_io_save_audio_file(
filepath,
src,
sample_rate,
channels_first,
compression,
format,
encoding,
bits_per_sample,
)
return getattr(_sox_io_backend, name)
......@@ -3,9 +3,10 @@ import warnings
from typing import List, Optional
import torchaudio
from torchaudio._backend import soundfile_backend
from torchaudio._internal import module_utils as _mod_utils
from . import no_backend, soundfile_backend, sox_io_backend
from . import _no_backend as no_backend, _sox_io_backend as sox_io_backend
__all__ = [
"list_audio_backends",
......@@ -23,7 +24,7 @@ def list_audio_backends() -> List[str]:
backends = []
if _mod_utils.is_module_available("soundfile"):
backends.append("soundfile")
if _mod_utils.is_sox_available():
if torchaudio._extension._SOX_INITIALIZED:
backends.append("sox_io")
return backends
......@@ -52,14 +53,19 @@ def set_audio_backend(backend: Optional[str]):
setattr(torchaudio, func, getattr(module, func))
def _init_audio_backend():
def _init_backend():
warnings.warn(
"TorchAudio's global backend is now deprecated. "
"Please enable distpatcher by setting `TORCHAUDIO_USE_BACKEND_DISPATCHER=1`, "
"and specify backend when calling load/info/save function.",
stacklevel=3,
)
backends = list_audio_backends()
if "sox_io" in backends:
set_audio_backend("sox_io")
elif "soundfile" in backends:
set_audio_backend("soundfile")
else:
warnings.warn("No audio backend is available.")
set_audio_backend(None)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment