UPDATE

ffeba11a · mayp777 · 29deb085 · ffeba11a · ffeba11a · ffeba11a
Commit ffeba11a authored Sep 02, 2024 by mayp777
20 changed files
--- a/torchaudio/_backend/__init__.py
+++ b/torchaudio/_backend/__init__.py
+from typing import List, Optional
+
+import torchaudio
+from torchaudio._internal.module_utils import deprecated
+
+
+# TODO: Once legacy global backend is removed, move this to torchaudio.__init__
+def _init_backend():
+    """Bind ``torchaudio.info``, ``torchaudio.load`` and ``torchaudio.save``.
+
+    Each attribute is set to the callable returned by the matching
+    ``utils.get_*_func()`` factory.
+    """
+    # Imported inside the function rather than at module import time.
+    from . import utils
+
+    torchaudio.info = utils.get_info_func()
+    torchaudio.load = utils.get_load_func()
+    torchaudio.save = utils.get_save_func()
+
+
+def list_audio_backends() -> List[str]:
+    """Return the names of the audio backends that are available.
+
+    Returns:
+        list of str: Names of the available backends.
+
+        The possible values are:
+
+        - Dispatcher mode: ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
+        - Legacy backend mode: ``"sox_io"``, ``"soundfile"``.
+    """
+    from . import utils
+
+    available = utils.get_available_backends()
+    return [name for name in available.keys()]
+
+
+# Temporary until global backend is removed
+@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
+def get_audio_backend() -> Optional[str]:
+    """Get the name of the current global backend.
+
+    This implementation is a deprecated stub: it performs no lookup.
+
+    Returns:
+        str or None: Always ``None`` in this implementation.
+    """
+    return None
+
+
+# Temporary until global backend is removed
+@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
+def set_audio_backend(backend: Optional[str]):  # noqa
+    """Set the global backend.
+
+    This implementation is a deprecated stub: the argument is accepted and ignored.
+
+    Args:
+        backend (str or None): Name of the backend.
+            One of ``"sox_io"`` or ``"soundfile"`` based on availability
+            of the system. If ``None`` is provided the current backend is unassigned.
+    """
+    pass
--- a/torchaudio/_backend/backend.py
+++ b/torchaudio/_backend/backend.py
+import os
+from abc import ABC, abstractmethod
+from typing import BinaryIO, Optional, Tuple, Union
+
+from torch import Tensor
+from torchaudio.io import CodecConfig
+
+from .common import AudioMetaData
+
+
+class Backend(ABC):
+    """Abstract interface that every audio I/O backend implements.
+
+    All operations are declared as abstract static methods, so a concrete
+    backend must override each of them; the base versions only raise
+    ``NotImplementedError``.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
+        """Return the metadata of the audio at ``uri`` (path or file-like object)."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def load(
+        uri: Union[BinaryIO, str, os.PathLike],
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+    ) -> Tuple[Tensor, int]:
+        """Decode the audio at ``uri`` and return a ``(Tensor, int)`` pair."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def save(
+        uri: Union[BinaryIO, str, os.PathLike],
+        src: Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
+        buffer_size: int = 4096,
+        compression: Optional[Union[CodecConfig, float, int]] = None,
+    ) -> None:
+        """Encode ``src`` and write it to ``uri`` (path or file-like object)."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Report whether this backend can decode ``uri`` with the given ``format``."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Report whether this backend can encode to ``uri`` with the given ``format``."""
+        raise NotImplementedError
--- a/torchaudio/_backend/common.py
+++ b/torchaudio/_backend/common.py
+class AudioMetaData:
+    """AudioMetaData()
+
+    Return type of ``torchaudio.info`` function.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+            * ``PCM_S``: Signed integer linear PCM
+            * ``PCM_U``: Unsigned integer linear PCM
+            * ``PCM_F``: Floating point linear PCM
+            * ``FLAC``: Flac, Free Lossless Audio Codec
+            * ``ULAW``: Mu-law
+            * ``ALAW``: A-law
+            * ``MP3`` : MP3, MPEG-1 Audio Layer III
+            * ``VORBIS``: OGG Vorbis
+            * ``AMR_WB``: Adaptive Multi-Rate Wideband
+            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+            * ``OPUS``: Opus
+            * ``HTK``: Single channel 16-bit PCM
+            * ``UNKNOWN`` : None of the above
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        num_frames: int,
+        num_channels: int,
+        bits_per_sample: int,
+        encoding: str,
+    ):
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        # Render the fields in declaration order as ``name=value`` pairs.
+        field_names = ("sample_rate", "num_frames", "num_channels", "bits_per_sample", "encoding")
+        rendered = ", ".join(f"{name}={getattr(self, name)}" for name in field_names)
+        return f"AudioMetaData({rendered})"
--- a/torchaudio/_backend/ffmpeg.py
+++ b/torchaudio/_backend/ffmpeg.py
+import os
+import re
+import sys
+from typing import BinaryIO, Optional, Tuple, Union
+
+import torch
+import torchaudio
+from torchaudio.io import StreamWriter
+
+from .backend import Backend
+from .common import AudioMetaData
+
+# Bind the file-object stream reader from the FFmpeg extension when that
+# extension is loaded; otherwise fall back to ``object`` so the name is still
+# defined at module level (it is used as an annotation further down).
+if torchaudio._extension._FFMPEG_EXT is not None:
+    StreamReaderFileObj = torchaudio._extension._FFMPEG_EXT.StreamReaderFileObj
+else:
+    StreamReaderFileObj = object
+
+
+def info_audio(
+    src: str,
+    format: Optional[str],
+) -> AudioMetaData:
+    """Query metadata for the audio at path ``src`` via the ``compat_info`` op.
+
+    The first five entries of the op's result are used, in order, as sample
+    rate, frame count, channel count, bits per sample and codec name; the
+    codec name is upper-cased.
+    """
+    fields = torch.ops.torchaudio.compat_info(src, format)
+    return AudioMetaData(
+        sample_rate=fields[0],
+        num_frames=fields[1],
+        num_channels=fields[2],
+        bits_per_sample=fields[3],
+        encoding=fields[4].upper(),
+    )
+
+
+def info_audio_fileobj(
+    src,
+    format: Optional[str],
+    buffer_size: int = 4096,
+) -> AudioMetaData:
+    """Query metadata for audio read from the file-like object ``src``.
+
+    When the stream info reports zero frames, the audio is decoded and the
+    frame count is taken from the decoded waveform instead.
+    """
+    reader = StreamReaderFileObj(src, format, None, buffer_size)
+    stream_info = reader.get_src_stream_info(reader.find_best_audio_stream())
+
+    num_frames = stream_info.num_frames
+    if num_frames == 0:
+        # Frame count not reported: decode and measure.
+        num_frames = _load_audio_fileobj(reader).size(1)
+
+    return AudioMetaData(
+        sample_rate=int(stream_info.sample_rate),
+        num_frames=num_frames,
+        num_channels=stream_info.num_channels,
+        bits_per_sample=stream_info.bits_per_sample,
+        encoding=stream_info.codec_name.upper(),
+    )
+
+
+def _get_load_filter(
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    convert: bool = True,
+) -> Optional[str]:
+    """Build the filter description string used when loading audio.
+
+    Args:
+        frame_offset: First sample to keep; must be non-negative.
+        num_frames: Number of samples to keep, or ``-1`` for all remaining.
+        convert: When true, append the ``aformat`` conversion to ``fltp``.
+
+    Returns:
+        The comma-joined filter chain, or ``None`` when neither trimming nor
+        conversion is requested.
+
+    Raises:
+        RuntimeError: If ``frame_offset`` is negative, or ``num_frames`` is
+            ``0`` or less than ``-1``.
+    """
+    if frame_offset < 0:
+        raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
+    if num_frames == 0 or num_frames < -1:
+        raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
+
+    stages = []
+    # A trim stage is only needed when either argument departs from its default.
+    if frame_offset != 0 or num_frames != -1:
+        if num_frames > 0:
+            stages.append("atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames))
+        else:
+            stages.append("atrim=start_sample={}".format(frame_offset))
+    if convert:
+        stages.append("aformat=sample_fmts=fltp")
+
+    return ",".join(stages) if stages else None
+
+
+def _load_audio_fileobj(
+    s: StreamReaderFileObj,
+    filter: Optional[str] = None,
+    channels_first: bool = True,
+) -> torch.Tensor:
+    """Decode the best audio stream of ``s`` in one pass and return its frames.
+
+    Raises:
+        RuntimeError: If the first popped chunk is ``None``.
+    """
+    stream_index = s.find_best_audio_stream()
+    s.add_audio_stream(stream_index, -1, -1, filter, None, None)
+    s.process_all_packets()
+
+    first_chunk = s.pop_chunks()[0]
+    if first_chunk is None:
+        raise RuntimeError("Failed to decode audio.")
+
+    frames = first_chunk.frames
+    if channels_first:
+        return frames.T
+    return frames
+
+
+def load_audio(
+    src: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    convert: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio from the path ``src`` through the ``compat_load`` op.
+
+    The trim/convert arguments are first turned into a filter description,
+    which is validated before the op is invoked.
+    """
+    load_filter = _get_load_filter(frame_offset, num_frames, convert)
+    result = torch.ops.torchaudio.compat_load(src, format, load_filter, channels_first)
+    return result
+
+
+def load_audio_fileobj(
+    src: BinaryIO,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    convert: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+    buffer_size: int = 4096,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio from the file-like object ``src``.
+
+    Returns:
+        The decoded waveform and the sample rate of the best audio stream.
+    """
+    # The "vorbis" format name is opened with the "ogg" demuxer.
+    demuxer = format
+    if format == "vorbis":
+        demuxer = "ogg"
+
+    reader = StreamReaderFileObj(src, demuxer, None, buffer_size)
+    best_stream = reader.find_best_audio_stream()
+    sample_rate = int(reader.get_src_stream_info(best_stream).sample_rate)
+
+    load_filter = _get_load_filter(frame_offset, num_frames, convert)
+    return _load_audio_fileobj(reader, load_filter, channels_first), sample_rate
+
+
+def _get_sample_format(dtype: torch.dtype) -> str:
+    """Translate a tensor dtype into the matching sample-format name.
+
+    Raises:
+        ValueError: If ``dtype`` has no corresponding sample format.
+    """
+    supported = {
+        torch.uint8: "u8",
+        torch.int16: "s16",
+        torch.int32: "s32",
+        torch.int64: "s64",
+        torch.float32: "flt",
+        torch.float64: "dbl",
+    }
+    if dtype not in supported:
+        raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(supported.keys())}.")
+    return supported[dtype]
+
+
+def _native_endianness() -> str:
+    """Return ``"le"`` on a little-endian host and ``"be"`` otherwise."""
+    return "le" if sys.byteorder == "little" else "be"
+
+
+def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
+    """Choose the PCM encoder name for WAV output.
+
+    Args:
+        encoding: One of ``"PCM_S"``, ``"PCM_U"``, ``"PCM_F"``, ``"ULAW"``,
+            ``"ALAW"``, or a falsy value to pick from the bit depth alone.
+        bits_per_sample: ``None`` or one of 8, 16, 24, 32, 64.
+
+    Raises:
+        ValueError: If the bit depth is not an accepted value, or the
+            encoding / bit-depth combination is not supported.
+    """
+    if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
+        raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
+    byte_order = _native_endianness()
+
+    if not encoding:
+        # No encoding requested: 16-bit signed by default, unsigned for 8-bit.
+        bits = bits_per_sample or 16
+        return "pcm_u8" if bits == 8 else f"pcm_s{bits}{byte_order}"
+
+    if encoding == "PCM_S":
+        bits = bits_per_sample or 16
+        if bits == 8:
+            raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
+        return f"pcm_s{bits}{byte_order}"
+
+    if encoding == "PCM_F":
+        bits = bits_per_sample or 32
+        if bits not in (32, 64):
+            raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
+        return f"pcm_f{bits}{byte_order}"
+
+    # The remaining encodings accept only an unset or 8-bit depth.
+    eight_bit_only = (
+        ("PCM_U", "pcm_u8", "For WAV unsigned PCM, only 8-bit encoding is supported."),
+        ("ULAW", "pcm_mulaw", "For WAV PCM mu-law, only 8-bit encoding is supported."),
+        ("ALAW", "pcm_alaw", "For WAV PCM A-law, only 8-bit encoding is supported."),
+    )
+    for name, encoder, message in eight_bit_only:
+        if encoding == name:
+            if bits_per_sample in (None, 8):
+                return encoder
+            raise ValueError(message)
+
+    raise ValueError(f"WAV encoding {encoding} is not supported.")
+
+
+def _get_flac_sample_fmt(bps):
+    """Map a FLAC bit depth to the encoder sample format.
+
+    ``None`` and 16 map to ``"s16"``; 24 maps to ``"s32"``.
+
+    Raises:
+        ValueError: For any other bit depth.
+    """
+    if bps == 24:
+        return "s32"
+    if bps in (None, 16):
+        return "s16"
+    raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
+
+
+def _parse_save_args(
+    ext: Optional[str],
+    format: Optional[str],
+    encoding: Optional[str],
+    bps: Optional[int],
+):
+    """Translate torchaudio-style save arguments into FFmpeg terms.
+
+    Returns:
+        A ``(muxer, encoder, encoder_sample_format)`` tuple.
+    """
+    # torchaudio's save function accepts the following, which do not map
+    # one-to-one onto FFmpeg concepts:
+    #
+    # - format: audio format
+    # - bits_per_sample: encoder sample format
+    # - encoding: such as PCM_U8.
+    #
+    # In FFmpeg, the output is described by (at least) these three:
+    #
+    # - muxer: an audio format or a container format;
+    #   the one passed to the constructor of StreamWriter
+    # - encoder: the audio encoder used to encode audio
+    # - encoder sample format: the format the encoder uses to encode audio.
+    #
+    # If the encoder sample format differs from the source sample format,
+    # StreamWriter inserts a filter automatically.
+    #
+    def _matches(name):
+        # An explicit format always wins; the extension is only consulted
+        # when no format override was given.
+        if format is None:
+            return ext == name
+        return format == name
+
+    if _matches("wav") or _matches("amb"):
+        # wav is special because it supports different encodings through encoders,
+        # and each encoder supports only one encoder format.
+        #
+        # amb is a special case originating from libsox.
+        # It is basically a WAV format, with a slight modification.
+        # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
+        # It is a format so that decoders will recognize it as ambisonic.
+        # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
+        # FFmpeg does not recognize amb because it is basically a WAV format.
+        return "wav", _get_encoder_for_wav(encoding, bps), None
+
+    if _matches("vorbis"):
+        # FFmpeg does not recognize the vorbis extension, while libsox used to.
+        # For backward compatibility (and simplicity), the case where users
+        # call save("foo.vorbis") is supported.
+        return "ogg", "vorbis", None
+
+    encoder_sample_fmt = None
+    if _matches("flac") or _matches("ogg"):
+        encoder_sample_fmt = _get_flac_sample_fmt(bps)
+    return format, None, encoder_sample_fmt
+
+
+def save_audio(
+    uri: Union[BinaryIO, str, os.PathLike],
+    src: torch.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = None,
+    buffer_size: int = 4096,
+    compression: Optional[torchaudio.io.CodecConfig] = None,
+) -> None:
+    """Encode ``src`` and write it to ``uri`` with ``StreamWriter``.
+
+    ``uri`` is treated as a file-like object when it has a ``write``
+    attribute; otherwise it is normalized as a path and the text after its
+    last ``.`` (lower-cased) is used as the extension hint.
+
+    Raises:
+        RuntimeError: If ``uri`` is a file-like object and ``format`` is ``None``.
+    """
+    ext = None
+    if hasattr(uri, "write"):
+        if format is None:
+            raise RuntimeError("'format' is required when saving to file object.")
+    else:
+        uri = os.path.normpath(uri)
+        _, dot, suffix = str(uri).rpartition(".")
+        if dot:
+            ext = suffix.lower()
+
+    muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)
+
+    # The writer is given the transposed tensor when channels come first.
+    frames = src.T if channels_first else src
+
+    writer = StreamWriter(uri, format=muxer, buffer_size=buffer_size)
+    writer.add_audio_stream(
+        sample_rate,
+        num_channels=frames.size(-1),
+        format=_get_sample_format(frames.dtype),
+        encoder=encoder,
+        encoder_format=enc_fmt,
+        codec_config=compression,
+    )
+    with writer.open():
+        writer.write_audio_chunk(0, frames)
+
+
+def _map_encoding(encoding: str) -> str:
+    """Collapse a codec name into torchaudio's encoding name.
+
+    Names containing ``PCM_S``, ``PCM_U`` or ``PCM_F`` map to that family;
+    ``PCM_MULAW`` / ``PCM_ALAW`` map to ``ULAW`` / ``ALAW``; anything else is
+    returned unchanged.
+    """
+    pcm_family = next((family for family in ("PCM_S", "PCM_U", "PCM_F") if family in encoding), None)
+    if pcm_family is not None:
+        return pcm_family
+    return {"PCM_MULAW": "ULAW", "PCM_ALAW": "ALAW"}.get(encoding, encoding)
+
+
+def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> int:
+    """Derive the bit depth from a codec name, falling back to the given value.
+
+    Args:
+        encoding: Upper-cased codec name, e.g. ``"PCM_S16LE"``.
+        bits_per_sample: Value to return when the name carries no bit depth.
+
+    Returns:
+        int: The digits following ``PCM_<letter>`` in the name when present,
+        8 for ``PCM_ALAW`` / ``PCM_MULAW``, otherwise ``bits_per_sample``.
+    """
+    if m := re.search(r"PCM_\w(\d+)\w*", encoding):
+        return int(m.group(1))
+    if encoding in ("PCM_ALAW", "PCM_MULAW"):
+        return 8
+    return bits_per_sample
+
+
+class FFmpegBackend(Backend):
+    """:class:`Backend` implementation built on this module's FFmpeg helpers."""
+
+    @staticmethod
+    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
+        """Return metadata for ``uri``, with encoding and bit depth normalized.
+
+        ``uri`` is treated as a file-like object when it has a ``read``
+        attribute; otherwise it is normalized as a path.
+        """
+        if hasattr(uri, "read"):
+            metadata = info_audio_fileobj(uri, format, buffer_size=buffer_size)
+        else:
+            metadata = info_audio(os.path.normpath(uri), format)
+        # Derive the bit depth first: it is parsed from the raw codec name,
+        # which the next line replaces with the collapsed encoding name.
+        metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
+        metadata.encoding = _map_encoding(metadata.encoding)
+        return metadata
+
+    @staticmethod
+    def load(
+        uri: Union[BinaryIO, str, os.PathLike],
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+    ) -> Tuple[torch.Tensor, int]:
+        """Load audio from a path or file-like object.
+
+        ``normalize`` is forwarded as the ``convert`` argument of the loader
+        helpers. ``buffer_size`` is only used for file-like objects.
+        """
+        if hasattr(uri, "read"):
+            return load_audio_fileobj(
+                uri,
+                frame_offset,
+                num_frames,
+                normalize,
+                channels_first,
+                format,
+                buffer_size,
+            )
+        else:
+            return load_audio(os.path.normpath(uri), frame_offset, num_frames, normalize, channels_first, format)
+
+    @staticmethod
+    def save(
+        uri: Union[BinaryIO, str, os.PathLike],
+        src: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
+        buffer_size: int = 4096,
+        compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
+    ) -> None:
+        """Save ``src`` to ``uri`` via :func:`save_audio`.
+
+        Raises:
+            ValueError: If ``compression`` is neither ``None`` nor a
+                ``torchaudio.io.CodecConfig``.
+        """
+        if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
+            # The two literals are joined by implicit concatenation so the
+            # exception carries a single message string.
+            raise ValueError(
+                "FFmpeg backend expects non-`None` value for argument `compression` to be of "
+                f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}"
+            )
+        save_audio(
+            uri,
+            src,
+            sample_rate,
+            channels_first,
+            format,
+            encoding,
+            bits_per_sample,
+            buffer_size,
+            compression,
+        )
+
+    @staticmethod
+    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Always ``True``: no input is rejected up front."""
+        return True
+
+    @staticmethod
+    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Always ``True``: no output target is rejected up front."""
+        return True
--- a/torchaudio/_backend/soundfile.py
+++ b/torchaudio/_backend/soundfile.py
+import os
+from typing import BinaryIO, Optional, Tuple, Union
+
+import torch
+from torchaudio.io import CodecConfig
+
+from . import soundfile_backend
+from .backend import Backend
+from .common import AudioMetaData
+
+
+class SoundfileBackend(Backend):
+    """:class:`Backend` implementation that delegates to ``soundfile_backend``."""
+
+    @staticmethod
+    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
+        """Return metadata for ``uri``; ``buffer_size`` is accepted but not forwarded."""
+        return soundfile_backend.info(uri, format)
+
+    @staticmethod
+    def load(
+        uri: Union[BinaryIO, str, os.PathLike],
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+    ) -> Tuple[torch.Tensor, int]:
+        """Load audio from ``uri``; ``buffer_size`` is accepted but not forwarded."""
+        return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
+
+    @staticmethod
+    def save(
+        uri: Union[BinaryIO, str, os.PathLike],
+        src: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
+        buffer_size: int = 4096,
+        compression: Optional[Union[CodecConfig, float, int]] = None,
+    ) -> None:
+        """Save ``src`` to ``uri``; ``buffer_size`` is accepted but not forwarded.
+
+        Raises:
+            ValueError: If a truthy ``compression`` value is given.
+        """
+        # NOTE(review): this is a truthiness test, so falsy values such as ``0``
+        # pass through without raising -- confirm that is intended.
+        if compression:
+            raise ValueError("soundfile backend does not support argument `compression`.")
+
+        soundfile_backend.save(
+            uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
+        )
+
+    @staticmethod
+    def can_decode(uri, format) -> bool:
+        """Always ``True``: no input is rejected up front."""
+        return True
+
+    @staticmethod
+    def can_encode(uri, format) -> bool:
+        """Always ``True``: no output target is rejected up front."""
+        return True
--- a/torchaudio/_backend/soundfile_backend.py
+++ b/torchaudio/_backend/soundfile_backend.py
+"""The new soundfile backend which will become default in 0.8.0 onward"""
+import warnings
+from typing import Optional, Tuple
+
+import torch
+from torchaudio._internal import module_utils as _mod_utils
+
+from .common import AudioMetaData
+
+
+# Set to True only after ``import soundfile`` has succeeded below.
+_IS_SOUNDFILE_AVAILABLE = False
+
+# TODO: import soundfile only when it is used.
+# ``_requires_soundfile`` is applied as a decorator further down: it is a no-op
+# when the import succeeds, and otherwise is built by ``fail_with_message``
+# with the reason soundfile cannot be used.
+if _mod_utils.is_module_available("soundfile"):
+    try:
+        import soundfile
+
+        _requires_soundfile = _mod_utils.no_op
+        _IS_SOUNDFILE_AVAILABLE = True
+    except Exception:
+        # The availability check passed but the import itself raised.
+        _requires_soundfile = _mod_utils.fail_with_message(
+            "requires soundfile, but we failed to import it. Please check the installation of soundfile."
+        )
+else:
+    _requires_soundfile = _mod_utils.fail_with_message(
+        "requires soundfile, but it is not installed. Please install soundfile."
+    )
+
+
+# Mapping from soundfile subtype to number of bits per sample.
+# This is mostly heuristic and the value is set to 0 when it is irrelevant
+# (lossy formats) or when it can't be inferred.
+# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
+# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
+# the default seems to be 8 bits but it can be compressed further to 4 bits.
+# The dict is inspired from
+# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
+_SUBTYPE_TO_BITS_PER_SAMPLE = {
+    "PCM_S8": 8,  # Signed 8 bit data
+    "PCM_16": 16,  # Signed 16 bit data
+    "PCM_24": 24,  # Signed 24 bit data
+    "PCM_32": 32,  # Signed 32 bit data
+    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
+    "FLOAT": 32,  # 32 bit float data
+    "DOUBLE": 64,  # 64 bit float data
+    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "IMA_ADPCM": 0,  # IMA ADPCM.
+    "MS_ADPCM": 0,  # Microsoft ADPCM.
+    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
+    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
+    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
+    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
+    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
+    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
+    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
+    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
+    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
+    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
+    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
+    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
+    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
+    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
+    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
+    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
+}
+
+
+def _get_bit_depth(subtype):
+    """Look up the bits per sample for a soundfile subtype.
+
+    An unknown subtype triggers a warning and yields 0.
+    """
+    bit_depth = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
+    if bit_depth is not None:
+        return bit_depth
+    warnings.warn(
+        f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
+        "attribute will be set to 0. If you are seeing this warning, please "
+        "report by opening an issue on github (after checking for existing/closed ones). "
+        "You may otherwise ignore this warning."
+    )
+    return 0
+
+
+# Mapping from soundfile subtype to the encoding name reported in AudioMetaData.
+# Subtypes missing from this table are reported as "UNKNOWN" by ``_get_encoding``.
+_SUBTYPE_TO_ENCODING = {
+    "PCM_S8": "PCM_S",
+    "PCM_16": "PCM_S",
+    "PCM_24": "PCM_S",
+    "PCM_32": "PCM_S",
+    "PCM_U8": "PCM_U",
+    "FLOAT": "PCM_F",
+    "DOUBLE": "PCM_F",
+    "ULAW": "ULAW",
+    "ALAW": "ALAW",
+    "VORBIS": "VORBIS",
+}
+
+
+def _get_encoding(format: str, subtype: str):
+    """Return the encoding name for a soundfile format/subtype pair.
+
+    The ``"FLAC"`` format is always reported as ``"FLAC"``; otherwise the
+    subtype is looked up, with ``"UNKNOWN"`` as the fallback.
+    """
+    return "FLAC" if format == "FLAC" else _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
+
+
+@_requires_soundfile
+def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
+    """Get signal information of an audio file.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        AudioMetaData: Metadata of the given audio.
+
+    """
+    file_info = soundfile.info(filepath)
+    bit_depth = _get_bit_depth(file_info.subtype)
+    encoding = _get_encoding(file_info.format, file_info.subtype)
+    return AudioMetaData(
+        file_info.samplerate,
+        file_info.frames,
+        file_info.channels,
+        bits_per_sample=bit_depth,
+        encoding=encoding,
+    )
+
+
+# Mapping from soundfile subtype to the dtype name requested from soundfile when
+# reading a WAV file without normalization.
+# NOTE(review): "PCM_24" has no entry here, so ``load(..., normalize=False)`` raises
+# ValueError for 24-bit WAV -- confirm whether "PCM_24": "int32" was intended.
+_SUBTYPE2DTYPE = {
+    "PCM_S8": "int8",
+    "PCM_U8": "uint8",
+    "PCM_16": "int16",
+    "PCM_32": "int32",
+    "FLOAT": "float32",
+    "DOUBLE": "float64",
+}
+
+
+@_requires_soundfile
+def load(
+    filepath: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio data from file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats:
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+    ``float32`` dtype, and the shape of `[channel, time]`.
+
+    .. warning::
+
+       ``normalize`` argument does not perform volume normalization.
+       It only converts the sample type to `torch.float32` from the native sample
+       type.
+
+       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
+       this function can return integer Tensor, where the samples are expressed within the whole range
+       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
+       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
+       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
+
+       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
+       ``flac`` and ``mp3``.
+
+       For these formats, this function always returns a ``float32`` Tensor.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        frame_offset (int, optional):
+            Number of frames to skip before start reading data.
+        num_frames (int, optional):
+            Maximum number of frames to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+            This function may return fewer frames if there are not enough
+            frames in the given file.
+        normalize (bool, optional):
+            When ``True``, this function converts the native sample type to ``float32``.
+            Default: ``True``.
+
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type.
+            This argument has no effect for formats other than integer WAV type.
+
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        (torch.Tensor, int): Resulting Tensor and sample rate.
+            If the input file has integer wav format and normalization is off, then it has
+            integer type, else ``float32`` type. If ``channels_first=True``, it has
+            `[channel, time]` else `[time, channel]`.
+
+    Raises:
+        ValueError: If the file is WAV, ``normalize=False``, and the file's subtype
+            has no entry in ``_SUBTYPE2DTYPE``.
+    """
+    with soundfile.SoundFile(filepath, "r") as file_:
+        # Anything that is not WAV, or any normalized read, is requested as float32.
+        if file_.format != "WAV" or normalize:
+            dtype = "float32"
+        elif file_.subtype not in _SUBTYPE2DTYPE:
+            raise ValueError(f"Unsupported subtype: {file_.subtype}")
+        else:
+            # Non-normalized WAV: read using the dtype mapped from the subtype.
+            dtype = _SUBTYPE2DTYPE[file_.subtype]
+
+        # NOTE(review): ``_prepare_read`` is a private soundfile method; presumably it
+        # applies ``frame_offset`` and returns the frame count to read -- confirm
+        # against the installed soundfile version.
+        frames = file_._prepare_read(frame_offset, None, num_frames)
+        waveform = file_.read(frames, dtype, always_2d=True)
+        sample_rate = file_.samplerate
+
+    waveform = torch.from_numpy(waveform)
+    if channels_first:
+        # Transpose the 2-D array read above before returning it.
+        waveform = waveform.t()
+    return waveform, sample_rate
+
+
+def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for WAV output.
+
+    With no encoding and no bit depth the subtype follows ``dtype``; otherwise
+    it is determined by the encoding / bit-depth combination.
+
+    Raises:
+        ValueError: If the dtype or the encoding / bit-depth combination is
+            not supported for WAV.
+    """
+    if not encoding:
+        if bits_per_sample:
+            # 8-bit maps to the unsigned subtype; other depths to ``PCM_<bits>``.
+            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
+        # Neither encoding nor bit depth given: follow the tensor dtype.
+        dtype_subtypes = {
+            torch.uint8: "PCM_U8",
+            torch.int16: "PCM_16",
+            torch.int32: "PCM_32",
+            torch.float32: "FLOAT",
+            torch.float64: "DOUBLE",
+        }
+        if dtype not in dtype_subtypes:
+            raise ValueError(f"Unsupported dtype for wav: {dtype}")
+        return dtype_subtypes[dtype]
+
+    if encoding == "PCM_S":
+        if bits_per_sample == 8:
+            raise ValueError("wav does not support 8-bit signed PCM encoding.")
+        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
+
+    if encoding == "PCM_F":
+        if bits_per_sample == 64:
+            return "DOUBLE"
+        if bits_per_sample in (None, 32):
+            return "FLOAT"
+        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
+
+    # The remaining encodings accept only an unset or 8-bit depth.
+    eight_bit_only = (
+        ("PCM_U", "PCM_U8", "wav only supports 8-bit unsigned PCM encoding."),
+        ("ULAW", "ULAW", "wav only supports 8-bit mu-law encoding."),
+        ("ALAW", "ALAW", "wav only supports 8-bit a-law encoding."),
+    )
+    for name, subtype, message in eight_bit_only:
+        if encoding == name:
+            if bits_per_sample in (None, 8):
+                return subtype
+            raise ValueError(message)
+
+    raise ValueError(f"wav does not support {encoding}.")
+
+
+def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for SPHERE output.
+
+    Raises:
+        ValueError: If the encoding is not supported for sph, or mu-law is
+            requested with a bit depth other than 8.
+    """
+    if encoding == "ALAW":
+        return "ALAW"
+    if encoding == "ULAW":
+        if bits_per_sample not in (None, 8):
+            raise ValueError("sph only supports 8-bit for mu-law encoding.")
+        return "ULAW"
+    if encoding in ("PCM_U", "PCM_F"):
+        raise ValueError(f"sph does not support {encoding} encoding.")
+    if encoding in (None, "PCM_S"):
+        # Signed PCM defaults to 32 bits when no depth is given.
+        if bits_per_sample:
+            return f"PCM_{bits_per_sample}"
+        return "PCM_32"
+    raise ValueError(f"sph does not support {encoding}.")
+
+
+def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for the given container format and options.
+
+    Args:
+        dtype (torch.dtype): dtype of the tensor being saved (used for ``"wav"`` only).
+        format (str): Lower-case format name or file extension.
+        encoding (str or None): Requested encoding, if any.
+        bits_per_sample (int or None): Requested bit depth, if any.
+
+    Returns:
+        str: Subtype name to hand to ``soundfile.write``.
+
+    Raises:
+        ValueError: If the format is unknown or the options are not valid for it.
+    """
+    # Formats with their own resolver.
+    if format == "wav":
+        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
+    if format == "sph":
+        return _get_subtype_for_sphere(encoding, bits_per_sample)
+
+    # Formats with a single fixed subtype.
+    if format == "mp3":
+        return "MPEG_LAYER_III"
+    if format in ("nis", "nist"):
+        return "PCM_16"
+
+    if format == "flac":
+        if encoding:
+            raise ValueError("flac does not support encoding.")
+        if not bits_per_sample:
+            return "PCM_16"
+        if bits_per_sample > 24:
+            raise ValueError("flac does not support bits_per_sample > 24.")
+        if bits_per_sample == 8:
+            return "PCM_S8"
+        return f"PCM_{bits_per_sample}"
+
+    if format in ("ogg", "vorbis"):
+        if bits_per_sample:
+            raise ValueError("ogg/vorbis does not support bits_per_sample.")
+        if encoding == "opus":
+            return "OPUS"
+        if encoding is None or encoding == "vorbis":
+            return "VORBIS"
+        raise ValueError(f"Unexpected encoding: {encoding}")
+
+    raise ValueError(f"Unsupported format: {format}")
+
+
+@_requires_soundfile
+def save(
+    filepath: str,
+    src: torch.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    compression: Optional[float] = None,
+    format: Optional[str] = None,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = None,
+):
+    """Save audio data to file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats;
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
+        which has a restriction on type annotation due to TorchScript compiler compatibility.
+
+    Args:
+        filepath (str or pathlib.Path): Path to audio file.
+        src (torch.Tensor): Audio data to save. Must be a 2D tensor.
+        sample_rate (int): Sampling rate.
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
+        compression (float or None, optional): Not used.
+            It is here only for interface compatibility with the "sox_io" backend.
+            A warning is issued when a value is given.
+        format (str or None, optional): Override the audio format.
+            When ``filepath`` argument is path-like object, audio format is
+            inferred from file extension. If the file extension is missing or
+            different, you can specify the correct format with this argument.
+
+            When ``filepath`` argument is file-like object,
+            this argument is required.
+
+            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
+            ``"flac"`` and ``"sph"``.
+        encoding (str or None, optional): Changes the encoding for supported formats.
+            This argument is effective only for supported formats, such as
+            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;
+
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
+
+        bits_per_sample (int or None, optional): Changes the bit depth for the
+            supported formats.
+            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
+            you can change the bit depth.
+            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
+
+    Raises:
+        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is invalid, or the
+            format/encoding/bit depth combination is not supported.
+        RuntimeError: If ``filepath`` is a file-like object and ``format`` is ``None``.
+
+    Supported formats/encodings/bit depth/compression are:
+
+    ``"wav"``
+        - 32-bit floating-point PCM
+        - 32-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 8-bit unsigned integer PCM
+        - 8-bit mu-law
+        - 8-bit a-law
+
+        Note:
+            Default encoding/bit depth is determined by the dtype of
+            the input Tensor.
+
+    ``"flac"``
+        - 8-bit
+        - 16-bit (default)
+        - 24-bit
+
+    ``"ogg"``, ``"vorbis"``
+        - Doesn't accept changing configuration.
+
+    ``"sph"``
+        - 8-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 32-bit signed integer PCM (default)
+        - 8-bit mu-law
+        - 8-bit a-law
+        - 16-bit a-law
+        - 24-bit a-law
+        - 32-bit a-law
+
+    """
+    if src.ndim != 2:
+        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
+    if compression is not None:
+        warnings.warn(
+            '`save` function of "soundfile" backend does not support "compression" parameter. '
+            "The argument is silently ignored."
+        )
+
+    # A file-like destination has no extension, so the format must be given explicitly.
+    writes_to_file_object = hasattr(filepath, "write")
+    if writes_to_file_object and format is None:
+        raise RuntimeError("`format` is required when saving to file object.")
+    ext = (format if writes_to_file_object else str(filepath).rsplit(".", 1)[-1]).lower()
+
+    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
+        raise ValueError("Invalid bits_per_sample.")
+    if bits_per_sample == 24:
+        warnings.warn(
+            "Saving audio with 24 bits per sample might warp samples near -1. "
+            "Using 16 bits per sample might be able to avoid this."
+        )
+    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
+
+    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
+    # so we extend the extensions manually here
+    if format is None and ext in ("nis", "nist", "sph"):
+        format = "NIST"
+
+    # soundfile receives the transposed tensor when the input is [channel, time].
+    data = src.t() if channels_first else src
+    soundfile.write(file=filepath, data=data, samplerate=sample_rate, subtype=subtype, format=format)
--- a/torchaudio/_backend/sox.py
+++ b/torchaudio/_backend/sox.py
+import os
+from typing import BinaryIO, Optional, Tuple, Union
+
+import torch
+from torchaudio.io import CodecConfig
+
+from .backend import Backend
+from .common import AudioMetaData
+
+
+class SoXBackend(Backend):
+    """Backend that delegates audio I/O to the ``torch.ops.torchaudio.sox_io_*`` operators.
+
+    File-like objects are rejected for both reading and writing; only paths are
+    passed on to the operators. ``buffer_size`` is accepted by every method but
+    is not used by this backend.
+    """
+
+    @staticmethod
+    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
+        """Fetch metadata of the audio at ``uri``.
+
+        Raises:
+            ValueError: If ``uri`` is a file-like object (has a ``read`` attribute).
+            RuntimeError: If the underlying operator returns a falsy result.
+        """
+        if hasattr(uri, "read"):
+            # NOTE: the literals are joined by implicit concatenation. A comma between them
+            # would pass two arguments and make the message render as a tuple.
+            raise ValueError(
+                "SoX backend does not support reading from file-like objects. "
+                "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg."
+            )
+        sinfo = torch.ops.torchaudio.sox_io_get_info(uri, format)
+        if not sinfo:
+            raise RuntimeError(f"Failed to fetch metadata for {uri}.")
+        return AudioMetaData(*sinfo)
+
+    @staticmethod
+    def load(
+        uri: Union[BinaryIO, str, os.PathLike],
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+    ) -> Tuple[torch.Tensor, int]:
+        """Load audio from ``uri``.
+
+        Returns:
+            The value returned by ``sox_io_load_audio_file``, annotated as
+            ``(torch.Tensor, int)``.
+
+        Raises:
+            ValueError: If ``uri`` is a file-like object (has a ``read`` attribute).
+            RuntimeError: If the underlying operator returns a falsy result.
+        """
+        if hasattr(uri, "read"):
+            raise ValueError(
+                "SoX backend does not support loading from file-like objects. "
+                "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg."
+            )
+        ret = torch.ops.torchaudio.sox_io_load_audio_file(
+            uri, frame_offset, num_frames, normalize, channels_first, format
+        )
+        if not ret:
+            raise RuntimeError(f"Failed to load audio from {uri}.")
+        return ret
+
+    @staticmethod
+    def save(
+        uri: Union[BinaryIO, str, os.PathLike],
+        src: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
+        buffer_size: int = 4096,
+        compression: Optional[Union[CodecConfig, float, int]] = None,
+    ) -> None:
+        """Save ``src`` to ``uri``.
+
+        Raises:
+            ValueError: If ``compression`` is neither ``None``, ``float`` nor ``int``,
+                or if ``uri`` is a file-like object (has a ``write`` attribute).
+        """
+        if not isinstance(compression, (float, int, type(None))):
+            raise ValueError(
+                "SoX backend expects non-`None` value for argument `compression` to be of "
+                f"type `float` or `int`, but received value of type {type(compression)}"
+            )
+        if hasattr(uri, "write"):
+            raise ValueError(
+                "SoX backend does not support writing to file-like objects. "
+                "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg."
+            )
+        # The operator takes `compression` before `format`, unlike this method's signature.
+        torch.ops.torchaudio.sox_io_save_audio_file(
+            uri,
+            src,
+            sample_rate,
+            channels_first,
+            compression,
+            format,
+            encoding,
+            bits_per_sample,
+        )
+
+    @staticmethod
+    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Return ``True`` unless ``uri`` is a readable file-like object."""
+        # i.e. not a file-like object.
+        return not hasattr(uri, "read")
+
+    @staticmethod
+    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
+        """Return ``True`` unless ``uri`` is a writable file-like object."""
+        # i.e. not a file-like object.
+        return not hasattr(uri, "write")
--- a/torchaudio/_backend/utils.py
+++ b/torchaudio/_backend/utils.py
+import os
+from functools import lru_cache
+from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
+
+import torch
+
+from torchaudio._extension import _FFMPEG_EXT, _SOX_INITIALIZED
+from torchaudio.io import CodecConfig
+
+from . import soundfile_backend
+
+from .backend import Backend
+from .common import AudioMetaData
+from .ffmpeg import FFmpegBackend
+from .soundfile import SoundfileBackend
+from .sox import SoXBackend
+
+
+@lru_cache(None)
+def get_available_backends() -> Dict[str, Type[Backend]]:
+    """Return the usable backends keyed by name, in dispatch priority order.
+
+    The order is ``"ffmpeg"``, ``"sox"``, ``"soundfile"``; a backend is included
+    only when its availability flag is set. The result is cached.
+    """
+    # (name, availability, backend class) in priority order.
+    candidates = (
+        ("ffmpeg", _FFMPEG_EXT is not None, FFmpegBackend),
+        ("sox", _SOX_INITIALIZED, SoXBackend),
+        ("soundfile", soundfile_backend._IS_SOUNDFILE_AVAILABLE, SoundfileBackend),
+    )
+    backend_specs: Dict[str, Type[Backend]] = {
+        name: backend_cls for name, available, backend_cls in candidates if available
+    }
+    return backend_specs
+
+
+def get_backend(backend_name, backends) -> Backend:
+    """Look up ``backend_name`` in ``backends``.
+
+    Args:
+        backend_name: Name of the requested backend.
+        backends: Mapping from backend name to backend.
+
+    Returns:
+        The entry registered under ``backend_name``.
+
+    Raises:
+        ValueError: If ``backend_name`` is missing from ``backends`` (or maps to a falsy value).
+    """
+    if backend := backends.get(backend_name):
+        return backend
+    # NOTE: the two f-strings are joined by implicit concatenation. A comma between them
+    # would pass two arguments and make the message render as a tuple.
+    raise ValueError(
+        f"Unsupported backend '{backend_name}' specified; "
+        f"please select one of {list(backends.keys())} instead."
+    )
+
+
+def get_info_func():
+    """Build the ``info`` function that dispatches to one of the available backends."""
+    backends = get_available_backends()
+
+    def dispatcher(
+        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
+    ) -> Backend:
+        # No explicit request: take the first backend that says it can decode the input.
+        if backend_name is None:
+            for candidate in backends.values():
+                if candidate.can_decode(uri, format):
+                    return candidate
+            raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
+        return get_backend(backend_name, backends)
+
+    def info(
+        uri: Union[BinaryIO, str, os.PathLike],
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+        backend: Optional[str] = None,
+    ) -> AudioMetaData:
+        """Get signal information of an audio file.
+
+        Note:
+            When the input type is file-like object, this function cannot
+            get the correct length (``num_samples``) for certain formats,
+            such as ``vorbis``.
+            In this case, the value of ``num_samples`` is ``0``.
+
+        Args:
+            uri (path-like object or file-like object):
+                Source of audio data. The following types are accepted:
+
+                * ``path-like``: File path or URL.
+                * ``file-like``: Object with ``read(size: int) -> bytes`` method,
+                  which returns byte string of at most ``size`` length.
+
+            format (str or None, optional):
+                If not ``None``, interpreted as hint that may allow backend to override the detected format.
+                (Default: ``None``)
+
+            buffer_size (int, optional):
+                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
+
+            backend (str or None, optional):
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend available.
+                (Default: ``None``)
+
+                .. seealso::
+                   :ref:`backend`
+
+        Returns:
+            AudioMetaData
+        """
+        selected = dispatcher(uri, format, backend)
+        return selected.info(uri, format, buffer_size)
+
+    return info
+
+
+def get_load_func():
+    """Build the ``load`` function that dispatches to one of the available backends."""
+    backends = get_available_backends()
+
+    def dispatcher(
+        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
+    ) -> Backend:
+        # No explicit request: take the first backend that says it can decode the input.
+        if backend_name is None:
+            for candidate in backends.values():
+                if candidate.can_decode(uri, format):
+                    return candidate
+            raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
+        return get_backend(backend_name, backends)
+
+    def load(
+        uri: Union[BinaryIO, str, os.PathLike],
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        buffer_size: int = 4096,
+        backend: Optional[str] = None,
+    ) -> Tuple[torch.Tensor, int]:
+        """Load audio data from source.
+
+        By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+        ``float32`` dtype, and the shape of `[channel, time]`.
+
+        Note:
+            The formats this function can handle depend on the availability of backends.
+            Please use the following functions to fetch the supported formats.
+
+            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
+            - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
+            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
+
+        .. warning::
+
+            ``normalize`` argument does not perform volume normalization.
+            It only converts the sample type to `torch.float32` from the native sample
+            type.
+
+            When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+            signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
+            this function can return integer Tensor, where the samples are expressed within the whole range
+            of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
+            ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
+            support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
+
+            ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
+            ``flac`` and ``mp3``.
+
+            For these formats, this function always returns ``float32`` Tensor with values.
+
+
+        Args:
+            uri (path-like object or file-like object):
+                Source of audio data.
+            frame_offset (int, optional):
+                Number of frames to skip before start reading data.
+            num_frames (int, optional):
+                Maximum number of frames to read. ``-1`` reads all the remaining samples,
+                starting from ``frame_offset``.
+                This function may return fewer frames if there are not enough
+                frames in the given file.
+            normalize (bool, optional):
+                When ``True``, this function converts the native sample type to ``float32``.
+                Default: ``True``.
+
+                If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+                integer type.
+                This argument has no effect for formats other than integer WAV type.
+
+            channels_first (bool, optional):
+                When True, the returned Tensor has dimension `[channel, time]`.
+                Otherwise, the returned Tensor's dimension is `[time, channel]`.
+
+            format (str or None, optional):
+                If not ``None``, interpreted as hint that may allow backend to override the detected format.
+                (Default: ``None``)
+
+            buffer_size (int, optional):
+                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
+
+            backend (str or None, optional):
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend being available. (Default: ``None``)
+
+                .. seealso::
+                   :ref:`backend`
+
+        Returns:
+            (torch.Tensor, int): Resulting Tensor and sample rate.
+                If the input file has integer wav format and normalization is off, then it has
+                integer type, else ``float32`` type. If ``channels_first=True``, it has
+                `[channel, time]` else `[time, channel]`.
+        """
+        selected = dispatcher(uri, format, backend)
+        return selected.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
+
+    return load
+
+
+def get_save_func():
+    """Build the ``save`` function that dispatches to one of the available backends."""
+    backends = get_available_backends()
+
+    def dispatcher(
+        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
+    ) -> Backend:
+        # No explicit request: take the first backend that says it can encode the output.
+        if backend_name is None:
+            for candidate in backends.values():
+                if candidate.can_encode(uri, format):
+                    return candidate
+            raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
+        return get_backend(backend_name, backends)
+
+    def save(
+        uri: Union[BinaryIO, str, os.PathLike],
+        src: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        format: Optional[str] = None,
+        encoding: Optional[str] = None,
+        bits_per_sample: Optional[int] = None,
+        buffer_size: int = 4096,
+        backend: Optional[str] = None,
+        compression: Optional[Union[CodecConfig, float, int]] = None,
+    ):
+        """Save audio data to file.
+
+        Note:
+            The formats this function can handle depend on the availability of backends.
+            Please use the following functions to fetch the supported formats.
+
+            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
+            - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
+            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
+
+        Args:
+            uri (str or pathlib.Path): Path to audio file.
+            src (torch.Tensor): Audio data to save. Must be a 2D tensor.
+            sample_rate (int): Sampling rate.
+            channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+                otherwise `[time, channel]`.
+            format (str or None, optional): Override the audio format.
+                When ``uri`` argument is path-like object, audio format is
+                inferred from file extension. If the file extension is missing or
+                different, you can specify the correct format with this argument.
+
+                When ``uri`` argument is file-like object,
+                this argument is required.
+
+                Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``.
+            encoding (str or None, optional): Changes the encoding for supported formats.
+                This argument is effective only for supported formats, i.e.
+                ``"wav"`` and ``"flac"``. Valid values are
+
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
+
+            bits_per_sample (int or None, optional): Changes the bit depth for the
+                supported formats.
+                When ``format`` is one of ``"wav"`` and ``"flac"``,
+                you can change the bit depth.
+                Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
+
+            buffer_size (int, optional):
+                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
+
+            backend (str or None, optional):
+                I/O backend to use.
+                If ``None``, function selects backend given input and available backends.
+                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
+                with the corresponding backend being available.
+                (Default: ``None``)
+
+                .. seealso::
+                   :ref:`backend`
+
+            compression (CodecConfig, float, int, or None, optional):
+                Compression configuration to apply.
+
+                If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
+
+                Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
+                ``sox`` command line interface must be provided. For instance:
+
+                ``"mp3"``
+                    Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
+                    VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
+
+                ``"flac"``
+                    Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
+
+                ``"ogg"``, ``"vorbis"``
+                    Number from ``-1`` to ``10``; ``-1`` is the highest compression
+                    and lowest quality. Default: ``3``.
+
+                Refer to http://sox.sourceforge.net/soxformat.html for more details.
+
+        """
+        selected = dispatcher(uri, format, backend)
+        return selected.save(
+            uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
+        )
+
+    return save
--- a/torchaudio/_extension/__init__.py
+++ b/torchaudio/_extension/__init__.py
+import logging
+import os
+import sys
+
+from torchaudio._internal.module_utils import eval_env, fail_with_message, is_module_available, no_op
+
+try:
+    from .fb import _init_ffmpeg
+except ImportError:
+    from .utils import _init_ffmpeg
+from .utils import _check_cuda_version, _fail_since_no_ffmpeg, _fail_since_no_sox, _init_dll_path, _init_sox, _load_lib
+
+# Module-level logger, used below to record (at debug level) optional-extension init failures.
+_LG = logging.getLogger(__name__)
+
+
+# Note:
+# `_check_cuda_version` is not meant to be used by regular users.
+# Builder uses it for debugging purpose, so we export it.
+# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
+__all__ = [
+    "fail_if_no_sox",
+    "fail_if_no_ffmpeg",
+    "_check_cuda_version",
+    "_IS_TORCHAUDIO_EXT_AVAILABLE",
+    "_IS_RIR_AVAILABLE",
+    "_SOX_INITIALIZED",
+    "_FFMPEG_EXT",
+]
+
+
+# Runs only on Windows with Python 3.8.x, before any extension is loaded below.
+# NOTE(review): presumably registers the DLL search path; see `_init_dll_path` in `.utils`.
+if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
+    _init_dll_path()
+
+
+# When the extension module is built, we initialize it.
+# In case of an error, we do not catch the failure as it suggests there is something
+# wrong with the installation.
+_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
+# RIR features are implemented in _torchaudio extension, but they can be individually
+# turned on/off at build time. Available means that _torchaudio is loaded properly, and
+# RIR features are found there.
+_IS_RIR_AVAILABLE = False
+# Alignment availability is queried from the same extension module below.
+_IS_ALIGN_AVAILABLE = False
+if _IS_TORCHAUDIO_EXT_AVAILABLE:
+    # Load the shared library first, then import the Python extension module.
+    _load_lib("libtorchaudio")
+
+    import torchaudio.lib._torchaudio  # noqa
+
+    _check_cuda_version()
+    _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
+    _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
+
+
+# Initialize libsox-related features
+# `_SOX_INITIALIZED` becomes True only if `_init_sox()` below completes without raising.
+_SOX_INITIALIZED = False
+# Always disabled on Windows; elsewhere read from the TORCHAUDIO_USE_SOX environment
+# variable (the second argument is presumably the default used when it is unset).
+_USE_SOX = False if os.name == "nt" else eval_env("TORCHAUDIO_USE_SOX", True)
+_SOX_MODULE_AVAILABLE = is_module_available("torchaudio.lib._torchaudio_sox")
+if _USE_SOX and _SOX_MODULE_AVAILABLE:
+    try:
+        _init_sox()
+        _SOX_INITIALIZED = True
+    except Exception:
+        # The initialization of sox extension will fail if supported sox
+        # libraries are not found in the system.
+        # Since the rest of the torchaudio works without it, we do not report the
+        # error here.
+        # The error will be raised when user code attempts to use these features.
+        _LG.debug("Failed to initialize sox extension", exc_info=True)
+
+
+# Select the `fail_if_no_sox` guard. Branches run from the most specific reason
+# (unsupported platform, disabled by env var, not compiled in) down to the generic
+# case, where the guard is `no_op` if sox initialized and `_fail_since_no_sox` otherwise.
+if os.name == "nt":
+    fail_if_no_sox = fail_with_message("requires sox extension, which is not supported on Windows.")
+elif not _USE_SOX:
+    fail_if_no_sox = fail_with_message("requires sox extension, but it is disabled. (TORCHAUDIO_USE_SOX=0)")
+elif not _SOX_MODULE_AVAILABLE:
+    fail_if_no_sox = fail_with_message(
+        "requires sox extension, but TorchAudio is not compiled with it. "
+        "Please build TorchAudio with libsox support. (BUILD_SOX=1)"
+    )
+else:
+    fail_if_no_sox = no_op if _SOX_INITIALIZED else _fail_since_no_sox
+
+
+# Initialize FFmpeg-related features
+# `_FFMPEG_EXT` holds whatever `_init_ffmpeg()` returns, or None when FFmpeg is
+# disabled, the core extension is unavailable, or initialization raised.
+_FFMPEG_EXT = None
+# Read from the TORCHAUDIO_USE_FFMPEG environment variable
+# (the second argument is presumably the default used when it is unset).
+_USE_FFMPEG = eval_env("TORCHAUDIO_USE_FFMPEG", True)
+if _USE_FFMPEG and _IS_TORCHAUDIO_EXT_AVAILABLE:
+    try:
+        _FFMPEG_EXT = _init_ffmpeg()
+    except Exception:
+        # The initialization of FFmpeg extension will fail if supported FFmpeg
+        # libraries are not found in the system.
+        # Since the rest of the torchaudio works without it, we do not report the
+        # error here.
+        # The error will be raised when user code attempts to use these features.
+        _LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
+
+
+# Select the `fail_if_no_ffmpeg` guard: `no_op` when the extension was initialized,
+# otherwise a failure helper whose message depends on whether FFmpeg was disabled by env var.
+if _USE_FFMPEG:
+    fail_if_no_ffmpeg = _fail_since_no_ffmpeg if _FFMPEG_EXT is None else no_op
+else:
+    fail_if_no_ffmpeg = fail_with_message("requires ffmpeg extension, but it is disabled. (TORCHAUDIO_USE_FFMPEG=0)")
+
+
+# Guard for RIR features: `no_op` when the extension reported RIR support.
+fail_if_no_rir = (
+    no_op
+    if _IS_RIR_AVAILABLE
+    else fail_with_message(
+        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
+    )
+)
+
+# Guard for alignment features: `no_op` when the extension reported alignment support.
+# NOTE: the message uses adjacent string literals rather than a backslash continuation
+# inside one literal, so the source indentation does not end up in the message text.
+fail_if_no_align = (
+    no_op
+    if _IS_ALIGN_AVAILABLE
+    else fail_with_message(
+        "Requires alignment extension, but TorchAudio is not compiled with it. "
+        "Please build TorchAudio with alignment support."
+    )
+)
--- a/torchaudio/_extension/utils.py
+++ b/torchaudio/_extension/utils.py
+"""Module to implement logics used for initializing extensions.
+
+The implementations here should be stateless.
+They should not depend on external state.
+Anything that depends on external state should happen in __init__.py
+"""
+
+
+import importlib
+import logging
+import os
+import platform
+import warnings
+from functools import wraps
+from pathlib import Path
+
+import torch
+import torchaudio
+
+_LG = logging.getLogger(__name__)
+_LIB_DIR = Path(__file__).parent.parent / "lib"
+
+
+def _get_lib_path(lib: str):
+    """Return the expected path of extension file ``lib`` under ``_LIB_DIR``.
+
+    The file suffix is ``pyd`` when ``os.name`` is ``"nt"`` and ``so`` otherwise.
+    The path is not checked for existence.
+    """
+    if os.name == "nt":
+        extension = "pyd"
+    else:
+        extension = "so"
+    return _LIB_DIR / f"{lib}.{extension}"
+
+
+def _load_lib(lib: str) -> bool:
+    """Load extension module
+
+    Note:
+        In case `torchaudio` is deployed with `pex` format, the library file
+        is not in a standard location.
+        In this case, we expect that `libtorchaudio` is available somewhere
+        in the search path of dynamic loading mechanism, so that importing
+        `_torchaudio` will have library loader find and load `libtorchaudio`.
+        This is the reason why the function should not raise an error when the library
+        file is not found.
+
+    Args:
+        lib (str): Library name without the file suffix, e.g. ``"libtorchaudio"``.
+
+    Returns:
+        bool:
+            True if the library file is found AND the library loaded without failure.
+            False if the library file is not found (like in the case where torchaudio
+            is deployed with pex format, thus the shared library file is
+            in a non-standard location.).
+            If the library file is found but there is an issue loading the library,
+            (such as missing dependency) then this function raises the exception as-is.
+
+    Raises:
+        Exception:
+            If the library file is found, but there is an issue loading the library file,
+            (when underlying `ctypes.CDLL` throws an exception), this function will pass
+            the exception as-is, instead of catching it and returning bool.
+            The expected case is `OSError` thrown by `ctypes.CDLL` when a dynamic dependency
+            is not found.
+            This behavior was chosen because the expected failure case is not recoverable.
+            If a dependency is missing, then users have to install it.
+    """
+    lib_path = _get_lib_path(lib)
+    if lib_path.exists():
+        # Register the library with both the operator and the custom-class loaders.
+        torch.ops.load_library(lib_path)
+        torch.classes.load_library(lib_path)
+        return True
+    return False
+
+
+def _init_sox():
+    """Load the sox extension and initialize sox effects.
+
+    Loads ``libtorchaudio_sox``, imports ``torchaudio.lib._torchaudio_sox``, sets its
+    verbosity to ``0``, initializes sox effects, and registers the matching shutdown
+    operator to run at interpreter exit. Any exception from these steps propagates
+    to the caller.
+    """
+    # The return value is ignored: a missing library file is not treated as an error here.
+    _load_lib("libtorchaudio_sox")
+    import torchaudio.lib._torchaudio_sox  # noqa
+
+    torchaudio.lib._torchaudio_sox.set_verbosity(0)
+
+    import atexit
+
+    # Pair the initialization with a shutdown hook registered via atexit.
+    torch.ops.torchaudio.sox_effects_initialize_sox_effects()
+    atexit.register(torch.ops.torchaudio.sox_effects_shutdown_sox_effects)
+
+
+def _try_access_avutil(ffmpeg_ver):
+    """Ask the core extension to locate the libavutil matching ``ffmpeg_ver``.
+
+    Args:
+        ffmpeg_ver (str): FFmpeg major version, one of ``"6"``, ``"5"`` or ``"4"``.
+
+    Raises:
+        KeyError: If the platform or ``ffmpeg_ver`` is not one of the known keys.
+    """
+    # Per-platform file name pattern of the libavutil shared library.
+    name_patterns = {
+        "Linux": "libavutil.so.{ver}",
+        "Darwin": "libavutil.{ver}.dylib",
+        "Windows": "avutil-{ver}.dll",
+    }
+    pattern = name_patterns[platform.system()]
+    # FFmpeg major version -> libavutil major version.
+    avutil_versions = {"6": 58, "5": 57, "4": 56}
+    libavutil = pattern.format(ver=avutil_versions[ffmpeg_ver])
+    torchaudio.lib._torchaudio.find_avutil(libavutil)
+
+
+def _find_versionsed_ffmpeg_extension(ffmpeg_ver: str):
+    """Load and import the FFmpeg extension built for ``ffmpeg_ver``.
+
+    Args:
+        ffmpeg_ver (str): Version suffix of the extension, e.g. ``"6"``. An empty
+            string selects the unsuffixed extension and skips the libavutil check.
+
+    Returns:
+        The imported ``torchaudio.lib._torchaudio_ffmpeg<ver>`` module.
+
+    Raises:
+        RuntimeError: If the extension file for this version does not exist.
+        Exception: Anything raised by the libavutil check, the library load or the import.
+    """
+    _LG.debug("Attempting to load FFmpeg version %s.", ffmpeg_ver)
+
+    library = f"libtorchaudio_ffmpeg{ffmpeg_ver}"
+    extension = f"_torchaudio_ffmpeg{ffmpeg_ver}"
+
+    if not _get_lib_path(extension).exists():
+        raise RuntimeError(f"FFmpeg {ffmpeg_ver} extension is not available.")
+
+    if ffmpeg_ver:
+        # A simple check for FFmpeg availability.
+        # This is not technically sufficient as other libraries could be missing,
+        # but usually this is sufficient.
+        #
+        # Note: the reason why this check is performed is because I don't know
+        # if the next `_load_lib` (which calls `ctypes.CDLL` under the hood),
+        # could leak handle to shared libraries of dependencies, in case it fails.
+        #
+        # i.e. If the `ctypes.CDLL("foo")` fails because one of `foo`'s dependencies
+        # does not exist while `foo` and some other dependencies exist, is it guaranteed
+        # that none of them are kept in memory after the failure?
+        _try_access_avutil(ffmpeg_ver)
+
+    # Load the shared library before importing the Python extension module that needs it.
+    _load_lib(library)
+
+    _LG.debug("Found FFmpeg version %s.", ffmpeg_ver)
+    return importlib.import_module(f"torchaudio.lib.{extension}")
+
+
+# FFmpeg version suffixes, listed as 6, 5, 4 and then "".
+# NOTE(review): "" presumably denotes the unversioned ("single FFmpeg mode") build -- confirm.
+_FFMPEG_VERS = ["6", "5", "4", ""]
+
+
+def _find_ffmpeg_extension(ffmpeg_vers, show_error):
+    """Return the first FFmpeg extension module that loads successfully.
+
+    Args:
+        ffmpeg_vers (list of str): FFmpeg versions to try, in order.
+        show_error (bool): If ``True``, load failures are logged at error level,
+            otherwise at debug level.
+
+    Returns:
+        The extension module of the first version that could be loaded.
+
+    Raises:
+        ImportError: If none of the given versions could be loaded.
+    """
+    logger = _LG.error if show_error else _LG.debug
+    for ffmpeg_ver in ffmpeg_vers:
+        try:
+            return _find_versionsed_ffmpeg_extension(ffmpeg_ver)
+        except Exception:
+            # Deliberate best effort: record the failure (with traceback) and
+            # fall through to the next candidate version.
+            logger("Failed to load FFmpeg %s extension.", ffmpeg_ver, exc_info=True)
+    raise ImportError(f"Failed to initialize FFmpeg extension. Tried versions: {ffmpeg_vers}")
+
+
+def _find_available_ffmpeg_ext():
+    """Return the FFmpeg versions whose extension file exists.
+
+    Returns:
+        list of str: The entries of ``_FFMPEG_VERS``, in the same order, for which
+        ``_torchaudio_ffmpeg{version}`` is found by ``_get_lib_path``.
+    """
+    # Reuse the module-level list instead of keeping a second copy in sync.
+    return [v for v in _FFMPEG_VERS if _get_lib_path(f"_torchaudio_ffmpeg{v}").exists()]
+
+
+def _init_ffmpeg(show_error=False):
+    """Find, load and initialize the FFmpeg extension.
+
+    The candidate versions are those reported by ``_find_available_ffmpeg_ext``.
+    A non-empty ``TORCHAUDIO_USE_FFMPEG_VERSION`` environment variable narrows
+    the candidates to that single version, unless only the unversioned
+    extension is available, in which case the variable is ignored with a warning.
+
+    Args:
+        show_error (bool): Forwarded to ``_find_ffmpeg_extension`` to select the
+            log level used for load failures.
+
+    Returns:
+        The initialized extension module.
+
+    Raises:
+        RuntimeError: If no FFmpeg extension is available.
+        ValueError: If the version requested via the environment variable is
+            not among the available ones.
+    """
+    available_vers = _find_available_ffmpeg_ext()
+    if not available_vers:
+        raise RuntimeError(
+            # fmt: off
+            "TorchAudio is not built with FFmpeg integration. "
+            "Please build torchaudio with USE_FFMPEG=1."
+            # fmt: on
+        )
+
+    candidates = available_vers
+    # User override
+    requested = os.environ.get("TORCHAUDIO_USE_FFMPEG_VERSION")
+    if requested:
+        if available_vers == [""]:
+            warnings.warn("TorchAudio is built in single FFmpeg mode. TORCHAUDIO_USE_FFMPEG_VERSION is ignored.")
+        elif requested not in available_vers:
+            raise ValueError(
+                f"The FFmpeg version {requested} (read from TORCHAUDIO_USE_FFMPEG_VERSION) "
+                f"is not available. Available versions are {[v for v in available_vers if v]}"
+            )
+        else:
+            candidates = [requested]
+
+    ext = _find_ffmpeg_extension(candidates, show_error)
+    ext.init()
+    # Cap the extension's log level at 8.
+    if ext.get_log_level() > 8:
+        ext.set_log_level(8)
+    return ext
+
+
+def _init_dll_path():
+    """Register every existing ``PATH`` entry as a DLL search directory.
+
+    On Windows, Python 3.8+ provides `os.add_dll_directory` to configure the
+    DLL search path. To find CUDA related DLLs, the conda environment/bin path
+    has to be configured. See:
+    https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python
+
+    Entries that cannot be added with `add_dll_directory` are simply ignored.
+    """
+    search_dirs = os.environ.get("PATH", "").split(";")
+    for entry in filter(os.path.exists, search_dirs):
+        try:
+            os.add_dll_directory(entry)
+        except Exception:
+            # Best effort by design: skip entries that cannot be registered.
+            continue
+
+
+def _check_cuda_version():
+    """Verify that TorchAudio and PyTorch report the same CUDA ``major.minor``.
+
+    The comparison is skipped when either side reports no CUDA version.
+
+    Returns:
+        The value returned by ``torchaudio.lib._torchaudio.cuda_version()``.
+
+    Raises:
+        RuntimeError: If both CUDA versions are known and they differ.
+    """
+    version = torchaudio.lib._torchaudio.cuda_version()
+    if version is None or torch.version.cuda is None:
+        return version
+
+    # Build "major.minor" from the digits of the TorchAudio-side version:
+    # everything before the last three digits, and the second-to-last digit.
+    digits = str(version)
+    ta_version = f"{digits[:-3]}.{digits[-2]}"
+    torch_parts = torch.version.cuda.split(".")
+    t_version = f"{torch_parts[0]}.{torch_parts[1]}"
+    if ta_version != t_version:
+        raise RuntimeError(
+            "Detected that PyTorch and TorchAudio were compiled with different CUDA versions. "
+            f"PyTorch has CUDA version {t_version} whereas TorchAudio has CUDA version {ta_version}. "
+            "Please install the TorchAudio version that matches your PyTorch version."
+        )
+    return version
+
+
+def _fail_since_no_sox(func):
+    """Wrap ``func`` so that calling it reports why the sox extension is missing.
+
+    Each call re-runs ``_init_sox``. If that raises, a ``RuntimeError`` naming
+    ``func`` is raised, chained to the original failure. If it unexpectedly
+    succeeds, ``func`` is called as usual.
+    """
+
+    @wraps(func)
+    def _guarded(*args, **kwargs):
+        try:
+            # Re-run the initialization only to surface its stacktrace to the
+            # user; it is not expected to succeed here.
+            _init_sox()
+        except Exception as cause:
+            message = (
+                f"{func.__name__} requires sox extension which is not available. "
+                "Please refer to the stacktrace above for how to resolve this."
+            )
+            raise RuntimeError(message) from cause
+        # Not expected in normal execution, but just in case.
+        return func(*args, **kwargs)
+
+    return _guarded
+
+
+def _fail_since_no_ffmpeg(func):
+    """Wrap ``func`` so that calling it reports why the FFmpeg extension is missing.
+
+    Each call re-runs ``_init_ffmpeg(show_error=True)``. If that raises, a
+    ``RuntimeError`` naming ``func`` is raised, chained to the original failure.
+    If it unexpectedly succeeds, ``func`` is called as usual.
+    """
+
+    @wraps(func)
+    def _guarded(*args, **kwargs):
+        try:
+            # Re-run the initialization only to surface its stacktrace to the
+            # user; it is not expected to succeed here.
+            _init_ffmpeg(show_error=True)
+        except Exception as cause:
+            message = (
+                f"{func.__name__} requires FFmpeg extension which is not available. "
+                "Please refer to the stacktrace above for how to resolve this."
+            )
+            raise RuntimeError(message) from cause
+        # Not expected in normal execution, but just in case.
+        return func(*args, **kwargs)
+
+    return _guarded
--- a/torchaudio/_internal/__init__.py
+++ b/torchaudio/_internal/__init__.py
-from torch.hub import download_url_to_file, load_state_dict_from_url
+try:
+    from .fb import download_url_to_file, load_state_dict_from_url
+except ImportError:
+    from torch.hub import download_url_to_file, load_state_dict_from_url


 __all__ = [

--- a/torchaudio/_internal/module_utils.py
+++ b/torchaudio/_internal/module_utils.py
 import importlib.util
+import os
 import warnings
 from functools import wraps
 from typing import Optional

-import torch
+
+def eval_env(var, default):
+    """Check if environment variable has True-y value.
+
+    The value is compared case-insensitively and with surrounding whitespace
+    ignored, so ``"True"``, ``"On"`` or ``" 1 "`` are accepted as well.
+
+    Args:
+        var (str): Name of the environment variable.
+        default: Value returned as-is when ``var`` is not set.
+
+    Returns:
+        ``default`` if ``var`` is not set, ``True`` for a true-y value and
+        ``False`` for a false-y value.
+
+    Raises:
+        RuntimeError: If the variable is set to an unrecognized value.
+    """
+    if var not in os.environ:
+        return default
+
+    val = os.environ[var]
+    trues = ["1", "true", "TRUE", "on", "ON", "yes", "YES"]
+    falses = ["0", "false", "FALSE", "off", "OFF", "no", "NO"]
+    # Both lists contain the lower-case spelling of every accepted word, so
+    # normalizing the input is enough to accept any capitalization.
+    normalized = val.strip().lower()
+    if normalized in trues:
+        return True
+    if normalized in falses:
+        return False
+    # fmt: off
+    raise RuntimeError(
+        f"Unexpected environment variable value `{var}={val}`. "
+        f"Expected one of {trues + falses}")
+    # fmt: on


 def is_module_available(*modules: str) -> bool:
@@ -42,106 +60,54 @@ def requires_module(*modules: str):
    return decorator


-def deprecated(direction: str, version: Optional[str] = None):
+def deprecated(direction: str, version: Optional[str] = None, remove: bool = False):
    """Decorator to add deprecation message

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed
+        remove (bool): If enabled, append future removal message.
    """

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
-            message = (
-                f"{func.__module__}.{func.__name__} has been deprecated "
-                f'and will be removed from {"future" if version is None else version} release. '
-                f"{direction}"
-            )
+            message = f"{func.__module__}.{func.__name__} has been deprecated. {direction}"
+            if remove:
+                message += f' It will be removed from {"future" if version is None else version} release. '
            warnings.warn(message, stacklevel=2)
            return func(*args, **kwargs)

-        return wrapped
-
-    return decorator
-
+        message = "This function has been deprecated. "
+        if remove:
+            message += f'It will be removed from {"future" if version is None else version} release. '

-def is_kaldi_available():
-    return is_module_available("torchaudio._torchaudio") and torch.ops.torchaudio.is_kaldi_available()
+        # A function without a docstring has ``__doc__ = None``; fall back to an empty
+        # string so the generated docstring does not read "DEPRECATED: None".
+        wrapped.__doc__ = f"""DEPRECATED: {func.__doc__ or ''}

+    .. warning::

-def requires_kaldi():
-    if is_kaldi_available():
+       {message}
+       {direction}
+        """

-        def decorator(func):
-            return func
-
-    else:
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(f"{func.__module__}.{func.__name__} requires kaldi")
-
-            return wrapped
+        return wrapped

    return decorator


-def _check_soundfile_importable():
-    if not is_module_available("soundfile"):
-        return False
-    try:
-        import soundfile  # noqa: F401
-
-        return True
-    except Exception:
-        warnings.warn("Failed to import soundfile. 'soundfile' backend is not available.")
-        return False
-
-
-_is_soundfile_importable = _check_soundfile_importable()
-
-
-def is_soundfile_available():
-    return _is_soundfile_importable
-
-
-def requires_soundfile():
-    if is_soundfile_available():
-
-        def decorator(func):
-            return func
-
-    else:
+def fail_with_message(message):
+    """Generate decorator to give users message about missing TorchAudio extension.
+
+    Args:
+        message (str): Text placed after ``<module>.<name>`` of the decorated
+            function in the error message.
+
+    Returns:
+        A decorator. The function it returns ignores its arguments and raises
+        ``RuntimeError`` every time it is called.
+    """

-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(f"{func.__module__}.{func.__name__} requires soundfile")
+    def decorator(func):
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(f"{func.__module__}.{func.__name__} {message}")

-            return wrapped
+        return wrapped

    return decorator


-def is_sox_available():
-    return is_module_available("torchaudio._torchaudio") and torch.ops.torchaudio.is_sox_available()
-
-
-def requires_sox():
-    if is_sox_available():
-
-        def decorator(func):
-            return func
-
-    else:
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(f"{func.__module__}.{func.__name__} requires sox")
-
-            return wrapped
-
-    return decorator
+def no_op(func):
+    """No-op decorator. Used in place of fail_with_message when a functionality that requires extension works fine.
+
+    Returns ``func`` itself, unchanged.
+    """
+    return func
--- a/torchaudio/backend/__init__.py
+++ b/torchaudio/backend/__init__.py
-# flake8: noqa
-from . import utils
-from .utils import get_audio_backend, list_audio_backends, set_audio_backend
+# NOTE:
+# The entire `torchaudio.backend` module is deprecated.
+# New things should be added to `torchaudio._backend`.
+# Only things related to backward compatibility should be placed here.


-utils._init_audio_backend()
+from . import common, no_backend, soundfile_backend, sox_io_backend  # noqa
+from .utils import _init_backend, get_audio_backend, list_audio_backends, set_audio_backend
+
+__all__ = ["_init_backend", "get_audio_backend", "list_audio_backends", "set_audio_backend"]
--- a/torchaudio/backend/_no_backend.py
+++ b/torchaudio/backend/_no_backend.py
+from pathlib import Path
+from typing import Callable, Optional, Tuple, Union
+
+from torch import Tensor
+
+
+def load(
+    filepath: Union[str, Path],
+    out: Optional[Tensor] = None,
+    normalization: Union[bool, float, Callable] = True,
+    channels_first: bool = True,
+    num_frames: int = 0,
+    offset: int = 0,
+    filetype: Optional[str] = None,
+) -> Tuple[Tensor, int]:
+    """Placeholder ``load`` that always fails because no audio I/O backend is available.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
+    """Placeholder ``save`` that always fails because no audio I/O backend is available.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def info(filepath: str) -> None:
+    """Placeholder ``info`` that always fails because no audio I/O backend is available.
+
+    The argument is ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
--- a/torchaudio/backend/_sox_io_backend.py
+++ b/torchaudio/backend/_sox_io_backend.py
+import os
+from typing import Optional, Tuple
+
+import torch
+import torchaudio
+from torchaudio import AudioMetaData
+
+
+@torchaudio._extension.fail_if_no_sox
+def info(
+    filepath: str,
+    format: Optional[str] = None,
+) -> AudioMetaData:
+    """Get signal information of an audio file.
+
+    Args:
+        filepath (path-like object):
+            Source of audio data. Outside of TorchScript the value is converted
+            with ``os.fspath``; file-like objects are not supported.
+
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension.
+
+    Returns:
+        AudioMetaData: Metadata of the given audio.
+
+    Raises:
+        RuntimeError: If ``filepath`` is a file-like object (has a ``read`` attribute).
+    """
+    # The Python-only checks are skipped when the function runs as TorchScript.
+    if not torch.jit.is_scripting():
+        if hasattr(filepath, "read"):
+            raise RuntimeError("sox_io backend does not support file-like object.")
+        filepath = os.fspath(filepath)
+    sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
+    return AudioMetaData(*sinfo)
+
+
+@torchaudio._extension.fail_if_no_sox
+def load(
+    filepath: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio data from file.
+
+    Note:
+        This function can handle all the codecs that underlying libsox can handle,
+        however it is tested on the following formats;
+
+        * WAV, AMB
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 24-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer (WAV only)
+
+        * MP3
+        * FLAC
+        * OGG/VORBIS
+        * OPUS
+        * SPHERE
+        * AMR-NB
+
+        To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not
+        handle natively, your installation of ``torchaudio`` has to be linked to ``libsox``
+        and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
+
+    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+    ``float32`` dtype, and the shape of `[channel, time]`.
+
+    .. warning::
+
+       ``normalize`` argument does not perform volume normalization.
+       It only converts the sample type to `torch.float32` from the native sample
+       type.
+
+       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
+       this function can return integer Tensor, where the samples are expressed within the whole range
+       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
+       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
+       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
+
+       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
+       ``flac`` and ``mp3``.
+
+       For these formats, this function always returns ``float32`` Tensor.
+
+    Args:
+        filepath (path-like object): Source of audio data.
+            File-like objects are not supported.
+        frame_offset (int):
+            Number of frames to skip before starting to read data.
+        num_frames (int, optional):
+            Maximum number of frames to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+            This function may return fewer frames if there are not enough
+            frames in the given file.
+        normalize (bool, optional):
+            When ``True``, this function converts the native sample type to ``float32``.
+            Default: ``True``.
+
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type.
+            This argument has no effect for formats other than integer WAV type.
+
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension.
+
+    Returns:
+        (torch.Tensor, int): Resulting Tensor and sample rate.
+            If the input file has integer wav format and ``normalize=False``, then it has
+            integer type, else ``float32`` type. If ``channels_first=True``, it has
+            `[channel, time]` else `[time, channel]`.
+
+    Raises:
+        RuntimeError: If ``filepath`` is a file-like object (has a ``read`` attribute).
+    """
+    # The Python-only checks are skipped when the function runs as TorchScript.
+    if not torch.jit.is_scripting():
+        if hasattr(filepath, "read"):
+            raise RuntimeError("sox_io backend does not support file-like object.")
+        filepath = os.fspath(filepath)
+    return torch.ops.torchaudio.sox_io_load_audio_file(
+        filepath, frame_offset, num_frames, normalize, channels_first, format
+    )
+
+
+@torchaudio._extension.fail_if_no_sox
+def save(
+    filepath: str,
+    src: torch.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    compression: Optional[float] = None,
+    format: Optional[str] = None,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = None,
+):
+    """Save audio data to file.
+
+    Args:
+        filepath (path-like object): Path to save file.
+            File-like objects are not supported.
+        src (torch.Tensor): Audio data to save. must be 2D tensor.
+        sample_rate (int): sampling rate
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
+        compression (float or None, optional): Used for formats other than WAV.
+            This corresponds to ``-C`` option of ``sox`` command.
+
+            ``"mp3"``
+                Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
+                VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
+
+            ``"flac"``
+                Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
+
+            ``"ogg"``, ``"vorbis"``
+                Number from ``-1`` to ``10``; ``-1`` is the highest compression
+                and lowest quality. Default: ``3``.
+
+            See the detail at http://sox.sourceforge.net/soxformat.html.
+        format (str or None, optional): Override the audio format.
+            The audio format is inferred from the file extension of ``filepath``.
+            If file extension is missing or different, you can specify the
+            correct format with this argument.
+
+            Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``,
+            ``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``.
+
+        encoding (str or None, optional): Changes the encoding for the supported formats.
+            This argument is effective only for supported formats, such as ``"wav"``, ``"amb"``
+            and ``"sph"``. Valid values are;
+
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
+
+            Default values
+                If not provided, the default value is picked based on ``format`` and ``bits_per_sample``.
+
+                ``"wav"``, ``"amb"``
+                    - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
+                      | Tensor is used to determine the default value.
+
+                        - ``"PCM_U"`` if dtype is ``uint8``
+                        - ``"PCM_S"`` if dtype is ``int16`` or ``int32``
+                        - ``"PCM_F"`` if dtype is ``float32``
+
+                    - ``"PCM_U"`` if ``bits_per_sample=8``
+                    - ``"PCM_S"`` otherwise
+
+                ``"sph"`` format;
+                    - the default value is ``"PCM_S"``
+
+        bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
+            When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the
+            bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``.
+
+            Default Value;
+                If not provided, the default values are picked based on ``format`` and ``"encoding"``;
+
+                ``"wav"``, ``"amb"``;
+                    - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
+                      | Tensor is used.
+
+                        - ``8`` if dtype is ``uint8``
+                        - ``16`` if dtype is ``int16``
+                        - ``32`` if dtype is  ``int32`` or ``float32``
+
+                    - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
+                    - ``16`` if ``encoding`` is ``"PCM_S"``
+                    - ``32`` if ``encoding`` is ``"PCM_F"``
+
+                ``"flac"`` format;
+                    - the default value is ``24``
+
+                ``"sph"`` format;
+                    - ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided.
+                    - ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"``
+
+                ``"amb"`` format;
+                    - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
+                    - ``16`` if ``encoding`` is ``"PCM_S"`` or not provided.
+                    - ``32`` if ``encoding`` is ``"PCM_F"``
+
+    Supported formats/encodings/bit depth/compression are;
+
+    ``"wav"``, ``"amb"``
+        - 32-bit floating-point PCM
+        - 32-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 8-bit unsigned integer PCM
+        - 8-bit mu-law
+        - 8-bit a-law
+
+        Note: Default encoding/bit depth is determined by the dtype of the input Tensor.
+
+    ``"mp3"``
+        Fixed bit rate (such as 128kbps) and variable bit rate compression.
+        Default: VBR with high quality.
+
+    ``"flac"``
+        - 8-bit
+        - 16-bit
+        - 24-bit (default)
+
+    ``"ogg"``, ``"vorbis"``
+        - Different quality level. Default: approx. 112kbps
+
+    ``"sph"``
+        - 8-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 32-bit signed integer PCM (default)
+        - 8-bit mu-law
+        - 8-bit a-law
+        - 16-bit a-law
+        - 24-bit a-law
+        - 32-bit a-law
+
+    ``"amr-nb"``
+        Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s
+
+    ``"gsm"``
+        Lossy Speech Compression, CPU intensive.
+
+    ``"htk"``
+        Uses a default single-channel 16-bit PCM format.
+
+    Note:
+        To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,
+        ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has
+        to be linked to ``libsox`` and corresponding codec libraries such as ``libmad``
+        or ``libmp3lame`` etc.
+
+    Raises:
+        RuntimeError: If ``filepath`` is a file-like object (has a ``write`` attribute).
+    """
+    # The Python-only checks are skipped when the function runs as TorchScript.
+    if not torch.jit.is_scripting():
+        if hasattr(filepath, "write"):
+            raise RuntimeError("sox_io backend does not handle file-like object.")
+        filepath = os.fspath(filepath)
+    torch.ops.torchaudio.sox_io_save_audio_file(
+        filepath,
+        src,
+        sample_rate,
+        channels_first,
+        compression,
+        format,
+        encoding,
+        bits_per_sample,
+    )
--- a/torchaudio/backend/common.py
+++ b/torchaudio/backend/common.py
-class AudioMetaData:
-    """Return type of ``torchaudio.info`` function.
-
-    This class is used by :py:mod:`"sox_io" backend<torchaudio.backends.sox_io_backend>` and
-    :py:mod:`"soundfile" backend<torchaudio.backends.soundfile_backend>`.
-
-    :ivar int sample_rate: Sample rate
-    :ivar int num_frames: The number of frames
-    :ivar int num_channels: The number of channels
-    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
-        or when it cannot be accurately inferred.
-    :ivar str encoding: Audio encoding
-        The values encoding can take are one of the following:
-
-            * ``PCM_S``: Signed integer linear PCM
-            * ``PCM_U``: Unsigned integer linear PCM
-            * ``PCM_F``: Floating point linear PCM
-            * ``FLAC``: Flac, Free Lossless Audio Codec
-            * ``ULAW``: Mu-law
-            * ``ALAW``: A-law
-            * ``MP3`` : MP3, MPEG-1 Audio Layer III
-            * ``VORBIS``: OGG Vorbis
-            * ``AMR_WB``: Adaptive Multi-Rate
-            * ``AMR_NB``: Adaptive Multi-Rate Wideband
-            * ``OPUS``: Opus
-            * ``HTK``: Single channel 16-bit PCM
-            * ``UNKNOWN`` : None of above
-    """
-
-    def __init__(
-        self,
-        sample_rate: int,
-        num_frames: int,
-        num_channels: int,
-        bits_per_sample: int,
-        encoding: str,
-    ):
-        self.sample_rate = sample_rate
-        self.num_frames = num_frames
-        self.num_channels = num_channels
-        self.bits_per_sample = bits_per_sample
-        self.encoding = encoding
-
-    def __str__(self):
-        return (
-            f"AudioMetaData("
-            f"sample_rate={self.sample_rate}, "
-            f"num_frames={self.num_frames}, "
-            f"num_channels={self.num_channels}, "
-            f"bits_per_sample={self.bits_per_sample}, "
-            f"encoding={self.encoding}"
-            f")"
+def __getattr__(name: str):
+    """Module-level attribute hook kept for backward compatibility.
+
+    ``AudioMetaData`` is imported on demand from ``torchaudio._backend.common``
+    after a warning that points to the new import path. Any other name raises
+    ``AttributeError``.
+    """
+    import warnings
+
+    if name == "AudioMetaData":
+        warnings.warn(
+            "`torchaudio.backend.common.AudioMetaData` has been moved to "
+            "`torchaudio.AudioMetaData`. Please update the import path.",
+            stacklevel=2,
        )
+        from torchaudio._backend.common import AudioMetaData
+
+        return AudioMetaData
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
--- a/torchaudio/backend/no_backend.py
+++ b/torchaudio/backend/no_backend.py
-from pathlib import Path
-from typing import Callable, Optional, Tuple, Union
+def __getattr__(name: str):
+    """Module-level attribute hook kept for backward compatibility.
+
+    Every lookup emits a warning that direct use of backend implementations is
+    no longer guaranteed to work, then forwards the lookup to the private
+    ``_no_backend`` module.
+    """
+    import warnings

-from torch import Tensor
+    warnings.warn(
+        "Torchaudio's I/O functions now support per-call backend dispatch. "
+        "Importing backend implementation directly is no longer guaranteed to work. "
+        "Please use `backend` keyword with load/save/info function, instead of "
+        "calling the underlying implementation directly.",
+        stacklevel=2,
+    )

+    from . import _no_backend

-def load(
-    filepath: Union[str, Path],
-    out: Optional[Tensor] = None,
-    normalization: Union[bool, float, Callable] = True,
-    channels_first: bool = True,
-    num_frames: int = 0,
-    offset: int = 0,
-    filetype: Optional[str] = None,
-) -> Tuple[Tensor, int]:
-    raise RuntimeError("No audio I/O backend is available.")
-
-
-def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
-
-
-def info(filepath: str) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
+    return getattr(_no_backend, name)
--- a/torchaudio/backend/soundfile_backend.py
+++ b/torchaudio/backend/soundfile_backend.py
-"""The new soundfile backend which will become default in 0.8.0 onward"""
-import warnings
-from typing import Optional, Tuple
-
-import torch
-from torchaudio._internal import module_utils as _mod_utils
-
-from .common import AudioMetaData
-
-
-if _mod_utils.is_soundfile_available():
-    import soundfile
-
-# Mapping from soundfile subtype to number of bits per sample.
-# This is mostly heuristical and the value is set to 0 when it is irrelevant
-# (lossy formats) or when it can't be inferred.
-# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
-# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
-# the default seems to be 8 bits but it can be compressed further to 4 bits.
-# The dict is inspired from
-# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
-_SUBTYPE_TO_BITS_PER_SAMPLE = {
-    "PCM_S8": 8,  # Signed 8 bit data
-    "PCM_16": 16,  # Signed 16 bit data
-    "PCM_24": 24,  # Signed 24 bit data
-    "PCM_32": 32,  # Signed 32 bit data
-    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
-    "FLOAT": 32,  # 32 bit float data
-    "DOUBLE": 64,  # 64 bit float data
-    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
-    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
-    "IMA_ADPCM": 0,  # IMA ADPCM.
-    "MS_ADPCM": 0,  # Microsoft ADPCM.
-    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
-    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
-    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
-    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
-    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
-    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
-    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
-    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
-    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
-    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
-    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
-    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
-    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
-    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
-    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
-    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
-}
-
-
-def _get_bit_depth(subtype):
-    if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
-        warnings.warn(
-            f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
-            "attribute will be set to 0. If you are seeing this warning, please "
-            "report by opening an issue on github (after checking for existing/closed ones). "
-            "You may otherwise ignore this warning."
-        )
-    return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
-
-
-_SUBTYPE_TO_ENCODING = {
-    "PCM_S8": "PCM_S",
-    "PCM_16": "PCM_S",
-    "PCM_24": "PCM_S",
-    "PCM_32": "PCM_S",
-    "PCM_U8": "PCM_U",
-    "FLOAT": "PCM_F",
-    "DOUBLE": "PCM_F",
-    "ULAW": "ULAW",
-    "ALAW": "ALAW",
-    "VORBIS": "VORBIS",
-}
-
-
-def _get_encoding(format: str, subtype: str):
-    if format == "FLAC":
-        return "FLAC"
-    return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
-
-
-@_mod_utils.requires_soundfile()
-def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
-    """Get signal information of an audio file.
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
-        which has a restriction on type annotation due to TorchScript compiler compatiblity.
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data.
-        format (str or None, optional):
-            Not used. PySoundFile does not accept format hint.
-
-    Returns:
-        AudioMetaData: meta data of the given audio.
-
-    """
-    sinfo = soundfile.info(filepath)
-    return AudioMetaData(
-        sinfo.samplerate,
-        sinfo.frames,
-        sinfo.channels,
-        bits_per_sample=_get_bit_depth(sinfo.subtype),
-        encoding=_get_encoding(sinfo.format, sinfo.subtype),
+def __getattr__(name: str):
+    import warnings
+
+    warnings.warn(
+        "Torchaudio's I/O functions now support per-call backend dispatch. "
+        "Importing backend implementation directly is no longer guaranteed to work. "
+        "Please use `backend` keyword with load/save/info function, instead of "
+        "calling the underlying implementation directly.",
+        stacklevel=2,
    )

+    from torchaudio._backend import soundfile_backend

-_SUBTYPE2DTYPE = {
-    "PCM_S8": "int8",
-    "PCM_U8": "uint8",
-    "PCM_16": "int16",
-    "PCM_32": "int32",
-    "FLOAT": "float32",
-    "DOUBLE": "float64",
-}
-
-
-@_mod_utils.requires_soundfile()
-def load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
-    """Load audio data from file.
-
-    Note:
-        The formats this function can handle depend on the soundfile installation.
-        This function is tested on the following formats;
-
-        * WAV
-
-            * 32-bit floating-point
-            * 32-bit signed integer
-            * 16-bit signed integer
-            * 8-bit unsigned integer
-
-        * FLAC
-        * OGG/VORBIS
-        * SPHERE
-
-    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
-    ``float32`` dtype, and the shape of `[channel, time]`.
-
-    .. warning::
-
-       ``normalize`` argument does not perform volume normalization.
-       It only converts the sample type to `torch.float32` from the native sample
-       type.
-
-       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
-       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
-       this function can return integer Tensor, where the samples are expressed within the whole range
-       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
-       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
-       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
-
-       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
-       ``flac`` and ``mp3``.
-
-       For these formats, this function always returns ``float32`` Tensor with values.
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
-        which has a restriction on type annotation due to TorchScript compiler compatiblity.
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data.
-        frame_offset (int, optional):
-            Number of frames to skip before start reading data.
-        num_frames (int, optional):
-            Maximum number of frames to read. ``-1`` reads all the remaining samples,
-            starting from ``frame_offset``.
-            This function may return the less number of frames if there is not enough
-            frames in the given file.
-        normalize (bool, optional):
-            When ``True``, this function converts the native sample type to ``float32``.
-            Default: ``True``.
-
-            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
-            integer type.
-            This argument has no effect for formats other than integer WAV type.
-
-        channels_first (bool, optional):
-            When True, the returned Tensor has dimension `[channel, time]`.
-            Otherwise, the returned Tensor's dimension is `[time, channel]`.
-        format (str or None, optional):
-            Not used. PySoundFile does not accept format hint.
-
-    Returns:
-        (torch.Tensor, int): Resulting Tensor and sample rate.
-            If the input file has integer wav format and normalization is off, then it has
-            integer type, else ``float32`` type. If ``channels_first=True``, it has
-            `[channel, time]` else `[time, channel]`.
-    """
-    with soundfile.SoundFile(filepath, "r") as file_:
-        if file_.format != "WAV" or normalize:
-            dtype = "float32"
-        elif file_.subtype not in _SUBTYPE2DTYPE:
-            raise ValueError(f"Unsupported subtype: {file_.subtype}")
-        else:
-            dtype = _SUBTYPE2DTYPE[file_.subtype]
-
-        frames = file_._prepare_read(frame_offset, None, num_frames)
-        waveform = file_.read(frames, dtype, always_2d=True)
-        sample_rate = file_.samplerate
-
-    waveform = torch.from_numpy(waveform)
-    if channels_first:
-        waveform = waveform.t()
-    return waveform, sample_rate
-
-
-def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
-    if not encoding:
-        if not bits_per_sample:
-            subtype = {
-                torch.uint8: "PCM_U8",
-                torch.int16: "PCM_16",
-                torch.int32: "PCM_32",
-                torch.float32: "FLOAT",
-                torch.float64: "DOUBLE",
-            }.get(dtype)
-            if not subtype:
-                raise ValueError(f"Unsupported dtype for wav: {dtype}")
-            return subtype
-        if bits_per_sample == 8:
-            return "PCM_U8"
-        return f"PCM_{bits_per_sample}"
-    if encoding == "PCM_S":
-        if not bits_per_sample:
-            return "PCM_32"
-        if bits_per_sample == 8:
-            raise ValueError("wav does not support 8-bit signed PCM encoding.")
-        return f"PCM_{bits_per_sample}"
-    if encoding == "PCM_U":
-        if bits_per_sample in (None, 8):
-            return "PCM_U8"
-        raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
-    if encoding == "PCM_F":
-        if bits_per_sample in (None, 32):
-            return "FLOAT"
-        if bits_per_sample == 64:
-            return "DOUBLE"
-        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
-    if encoding == "ULAW":
-        if bits_per_sample in (None, 8):
-            return "ULAW"
-        raise ValueError("wav only supports 8-bit mu-law encoding.")
-    if encoding == "ALAW":
-        if bits_per_sample in (None, 8):
-            return "ALAW"
-        raise ValueError("wav only supports 8-bit a-law encoding.")
-    raise ValueError(f"wav does not support {encoding}.")
-
-
-def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
-    if encoding in (None, "PCM_S"):
-        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
-    if encoding in ("PCM_U", "PCM_F"):
-        raise ValueError(f"sph does not support {encoding} encoding.")
-    if encoding == "ULAW":
-        if bits_per_sample in (None, 8):
-            return "ULAW"
-        raise ValueError("sph only supports 8-bit for mu-law encoding.")
-    if encoding == "ALAW":
-        return "ALAW"
-    raise ValueError(f"sph does not support {encoding}.")
-
-
-def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
-    if format == "wav":
-        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
-    if format == "flac":
-        if encoding:
-            raise ValueError("flac does not support encoding.")
-        if not bits_per_sample:
-            return "PCM_16"
-        if bits_per_sample > 24:
-            raise ValueError("flac does not support bits_per_sample > 24.")
-        return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
-    if format in ("ogg", "vorbis"):
-        if encoding or bits_per_sample:
-            raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
-        return "VORBIS"
-    if format == "sph":
-        return _get_subtype_for_sphere(encoding, bits_per_sample)
-    if format in ("nis", "nist"):
-        return "PCM_16"
-    raise ValueError(f"Unsupported format: {format}")
-
-
-@_mod_utils.requires_soundfile()
-def save(
-    filepath: str,
-    src: torch.Tensor,
-    sample_rate: int,
-    channels_first: bool = True,
-    compression: Optional[float] = None,
-    format: Optional[str] = None,
-    encoding: Optional[str] = None,
-    bits_per_sample: Optional[int] = None,
-):
-    """Save audio data to file.
-
-    Note:
-        The formats this function can handle depend on the soundfile installation.
-        This function is tested on the following formats;
-
-        * WAV
-
-            * 32-bit floating-point
-            * 32-bit signed integer
-            * 16-bit signed integer
-            * 8-bit unsigned integer
-
-        * FLAC
-        * OGG/VORBIS
-        * SPHERE
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
-        which has a restriction on type annotation due to TorchScript compiler compatiblity.
-
-    Args:
-        filepath (str or pathlib.Path): Path to audio file.
-        src (torch.Tensor): Audio data to save. must be 2D tensor.
-        sample_rate (int): sampling rate
-        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
-            otherwise `[time, channel]`.
-        compression (float of None, optional): Not used.
-            It is here only for interface compatibility reson with "sox_io" backend.
-        format (str or None, optional): Override the audio format.
-            When ``filepath`` argument is path-like object, audio format is
-            inferred from file extension. If the file extension is missing or
-            different, you can specify the correct format with this argument.
-
-            When ``filepath`` argument is file-like object,
-            this argument is required.
-
-            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
-            ``"flac"`` and ``"sph"``.
-        encoding (str or None, optional): Changes the encoding for supported formats.
-            This argument is effective only for supported formats, sush as
-            ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
-
-                - ``"PCM_S"`` (signed integer Linear PCM)
-                - ``"PCM_U"`` (unsigned integer Linear PCM)
-                - ``"PCM_F"`` (floating point PCM)
-                - ``"ULAW"`` (mu-law)
-                - ``"ALAW"`` (a-law)
-
-        bits_per_sample (int or None, optional): Changes the bit depth for the
-            supported formats.
-            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
-            you can change the bit depth.
-            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
-
-    Supported formats/encodings/bit depth/compression are:
-
-    ``"wav"``
-        - 32-bit floating-point PCM
-        - 32-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 8-bit unsigned integer PCM
-        - 8-bit mu-law
-        - 8-bit a-law
-
-        Note:
-            Default encoding/bit depth is determined by the dtype of
-            the input Tensor.
-
-    ``"flac"``
-        - 8-bit
-        - 16-bit (default)
-        - 24-bit
-
-    ``"ogg"``, ``"vorbis"``
-        - Doesn't accept changing configuration.
-
-    ``"sph"``
-        - 8-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 32-bit signed integer PCM (default)
-        - 8-bit mu-law
-        - 8-bit a-law
-        - 16-bit a-law
-        - 24-bit a-law
-        - 32-bit a-law
-
-    """
-    if src.ndim != 2:
-        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
-    if compression is not None:
-        warnings.warn(
-            '`save` function of "soundfile" backend does not support "compression" parameter. '
-            "The argument is silently ignored."
-        )
-    if hasattr(filepath, "write"):
-        if format is None:
-            raise RuntimeError("`format` is required when saving to file object.")
-        ext = format.lower()
-    else:
-        ext = str(filepath).split(".")[-1].lower()
-
-    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
-        raise ValueError("Invalid bits_per_sample.")
-    if bits_per_sample == 24:
-        warnings.warn(
-            "Saving audio with 24 bits per sample might warp samples near -1. "
-            "Using 16 bits per sample might be able to avoid this."
-        )
-    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
-
-    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
-    # so we extend the extensions manually here
-    if ext in ["nis", "nist", "sph"] and format is None:
-        format = "NIST"
-
-    if channels_first:
-        src = src.t()
-
-    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
+    return getattr(soundfile_backend, name)
--- a/torchaudio/backend/sox_io_backend.py
+++ b/torchaudio/backend/sox_io_backend.py
-import os
-from typing import Optional, Tuple
-
-import torch
-import torchaudio
-from torchaudio._internal import module_utils as _mod_utils
-from torchaudio.utils.sox_utils import get_buffer_size
-
-from .common import AudioMetaData
-
-
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
-
-
-def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
-
-
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
-    raise RuntimeError("Failed to load audio from {}".format(filepath))
-
-
-def _fail_load_fileobj(fileobj, *args, **kwargs):
-    raise RuntimeError(f"Failed to load audio from {fileobj}")
-
-
-if torchaudio._extension._FFMPEG_INITIALIZED:
-    import torchaudio.io._compat as _compat
-
-    _fallback_info = _compat.info_audio
-    _fallback_info_fileobj = _compat.info_audio_fileobj
-    _fallback_load = _compat.load_audio
-    _fallback_load_fileobj = _compat.load_audio_fileobj
-else:
-    _fallback_info = _fail_info
-    _fallback_info_fileobj = _fail_info_fileobj
-    _fallback_load = _fail_load
-    _fallback_load_fileobj = _fail_load_fileobj
-
-
-@_mod_utils.requires_sox()
-def info(
-    filepath: str,
-    format: Optional[str] = None,
-) -> AudioMetaData:
-    """Get signal information of an audio file.
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data. When the function is not compiled by TorchScript,
-            (e.g. ``torch.jit.script``), the following types are accepted;
-
-                  * ``path-like``: file path
-                  * ``file-like``: Object with ``read(size: int) -> bytes`` method,
-                    which returns byte string of at most ``size`` length.
-
-            When the function is compiled by TorchScript, only ``str`` type is allowed.
-
-            Note:
-
-                  * When the input type is file-like object, this function cannot
-                    get the correct length (``num_samples``) for certain formats,
-                    such as ``vorbis``.
-                    In this case, the value of ``num_samples`` is ``0``.
-                  * This argument is intentionally annotated as ``str`` only due to
-                    TorchScript compiler compatibility.
-
-        format (str or None, optional):
-            Override the format detection with the given format.
-            Providing the argument might help when libsox can not infer the format
-            from header or extension.
-
-    Returns:
-        AudioMetaData: Metadata of the given audio.
-    """
-    if not torch.jit.is_scripting():
-        if hasattr(filepath, "read"):
-            # Special case for Backward compatibility
-            # v0.11 -> v0.12, mp3 handling is moved to FFmpeg.
-            # file-like objects are not necessarily fallback-able
-            # when they are not seekable.
-            # The previous libsox-based implementation required `format="mp3"`
-            # because internally libsox does not auto-detect the format.
-            # For the special BC for mp3, we handle mp3 differently.
-            buffer_size = get_buffer_size()
-            if format == "mp3":
-                return _fallback_info_fileobj(filepath, format, buffer_size)
-            sinfo = torchaudio._torchaudio.get_info_fileobj(filepath, format)
-            if sinfo is not None:
-                return AudioMetaData(*sinfo)
-            return _fallback_info_fileobj(filepath, format, buffer_size)
-        filepath = os.fspath(filepath)
-    sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
-    if sinfo is not None:
-        return AudioMetaData(*sinfo)
-    return _fallback_info(filepath, format)
-
-
-@_mod_utils.requires_sox()
-def load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
-    """Load audio data from file.
-
-    Note:
-        This function can handle all the codecs that underlying libsox can handle,
-        however it is tested on the following formats;
-
-        * WAV, AMB
-
-            * 32-bit floating-point
-            * 32-bit signed integer
-            * 24-bit signed integer
-            * 16-bit signed integer
-            * 8-bit unsigned integer (WAV only)
-
-        * MP3
-        * FLAC
-        * OGG/VORBIS
-        * OPUS
-        * SPHERE
-        * AMR-NB
-
-        To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not
-        handle natively, your installation of ``torchaudio`` has to be linked to ``libsox``
-        and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc.
-
-    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
-    ``float32`` dtype, and the shape of `[channel, time]`.
-
-    .. warning::
-
-       ``normalize`` argument does not perform volume normalization.
-       It only converts the sample type to `torch.float32` from the native sample
-       type.
-
-       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
-       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
-       this function can return integer Tensor, where the samples are expressed within the whole range
-       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
-       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
-       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
-
-       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
-       ``flac`` and ``mp3``.
-
-       For these formats, this function always returns ``float32`` Tensor with values.
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data. When the function is not compiled by TorchScript,
-            (e.g. ``torch.jit.script``), the following types are accepted;
-
-                  * ``path-like``: file path
-                  * ``file-like``: Object with ``read(size: int) -> bytes`` method,
-                    which returns byte string of at most ``size`` length.
-
-            When the function is compiled by TorchScript, only ``str`` type is allowed.
-
-            Note: This argument is intentionally annotated as ``str`` only due to
-            TorchScript compiler compatibility.
-        frame_offset (int):
-            Number of frames to skip before start reading data.
-        num_frames (int, optional):
-            Maximum number of frames to read. ``-1`` reads all the remaining samples,
-            starting from ``frame_offset``.
-            This function may return the less number of frames if there is not enough
-            frames in the given file.
-        normalize (bool, optional):
-            When ``True``, this function converts the native sample type to ``float32``.
-            Default: ``True``.
-
-            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
-            integer type.
-            This argument has no effect for formats other than integer WAV type.
-
-        channels_first (bool, optional):
-            When True, the returned Tensor has dimension `[channel, time]`.
-            Otherwise, the returned Tensor's dimension is `[time, channel]`.
-        format (str or None, optional):
-            Override the format detection with the given format.
-            Providing the argument might help when libsox can not infer the format
-            from header or extension.
-
-    Returns:
-        (torch.Tensor, int): Resulting Tensor and sample rate.
-            If the input file has integer wav format and ``normalize=False``, then it has
-            integer type, else ``float32`` type. If ``channels_first=True``, it has
-            `[channel, time]` else `[time, channel]`.
-    """
-    if not torch.jit.is_scripting():
-        if hasattr(filepath, "read"):
-            # Special case for Backward compatibility
-            # v0.11 -> v0.12, mp3 handling is moved to FFmpeg.
-            # file-like objects are not necessarily fallback-able
-            # when they are not seekable.
-            # The previous libsox-based implementation required `format="mp3"`
-            # because internally libsox does not auto-detect the format.
-            # For the special BC for mp3, we handle mp3 differently.
-            buffer_size = get_buffer_size()
-            if format == "mp3":
-                return _fallback_load_fileobj(
-                    filepath,
-                    frame_offset,
-                    num_frames,
-                    normalize,
-                    channels_first,
-                    format,
-                    buffer_size,
-                )
-            ret = torchaudio._torchaudio.load_audio_fileobj(
-                filepath, frame_offset, num_frames, normalize, channels_first, format
-            )
-            if ret is not None:
-                return ret
-            return _fallback_load_fileobj(
-                filepath,
-                frame_offset,
-                num_frames,
-                normalize,
-                channels_first,
-                format,
-                buffer_size,
-            )
-        filepath = os.fspath(filepath)
-    ret = torch.ops.torchaudio.sox_io_load_audio_file(
-        filepath, frame_offset, num_frames, normalize, channels_first, format
+def __getattr__(name: str):
+    import warnings
+
+    warnings.warn(
+        "Torchaudio's I/O functions now support per-call backend dispatch. "
+        "Importing backend implementation directly is no longer guaranteed to work. "
+        "Please use `backend` keyword with load/save/info function, instead of "
+        "calling the underlying implementation directly.",
+        stacklevel=2,
    )
-    if ret is not None:
-        return ret
-    return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
-
-
-@_mod_utils.requires_sox()
-def save(
-    filepath: str,
-    src: torch.Tensor,
-    sample_rate: int,
-    channels_first: bool = True,
-    compression: Optional[float] = None,
-    format: Optional[str] = None,
-    encoding: Optional[str] = None,
-    bits_per_sample: Optional[int] = None,
-):
-    """Save audio data to file.
-
-    Args:
-        filepath (str or pathlib.Path): Path to save file.
-            This function also handles ``pathlib.Path`` objects, but is annotated
-            as ``str`` for TorchScript compiler compatibility.
-        src (torch.Tensor): Audio data to save. must be 2D tensor.
-        sample_rate (int): sampling rate
-        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
-            otherwise `[time, channel]`.
-        compression (float or None, optional): Used for formats other than WAV.
-            This corresponds to ``-C`` option of ``sox`` command.
-
-            ``"mp3"``
-                Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
-                VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
-
-            ``"flac"``
-                Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
-
-            ``"ogg"``, ``"vorbis"``
-                Number from ``-1`` to ``10``; ``-1`` is the highest compression
-                and lowest quality. Default: ``3``.
-
-            See the detail at http://sox.sourceforge.net/soxformat.html.
-        format (str or None, optional): Override the audio format.
-            When ``filepath`` argument is path-like object, audio format is infered from
-            file extension. If file extension is missing or different, you can specify the
-            correct format with this argument.
-
-            When ``filepath`` argument is file-like object, this argument is required.
-
-            Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``,
-            ``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``.
-
-        encoding (str or None, optional): Changes the encoding for the supported formats.
-            This argument is effective only for supported formats, such as ``"wav"``, ``""amb"``
-            and ``"sph"``. Valid values are;
-
-                - ``"PCM_S"`` (signed integer Linear PCM)
-                - ``"PCM_U"`` (unsigned integer Linear PCM)
-                - ``"PCM_F"`` (floating point PCM)
-                - ``"ULAW"`` (mu-law)
-                - ``"ALAW"`` (a-law)
-
-            Default values
-                If not provided, the default value is picked based on ``format`` and ``bits_per_sample``.
-
-                ``"wav"``, ``"amb"``
-                    - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
-                      | Tensor is used to determine the default value.
-
-                        - ``"PCM_U"`` if dtype is ``uint8``
-                        - ``"PCM_S"`` if dtype is ``int16`` or ``int32``
-                        - ``"PCM_F"`` if dtype is ``float32``
-
-                    - ``"PCM_U"`` if ``bits_per_sample=8``
-                    - ``"PCM_S"`` otherwise

-                ``"sph"`` format;
-                    - the default value is ``"PCM_S"``
+    from . import _sox_io_backend

-        bits_per_sample (int or None, optional): Changes the bit depth for the supported formats.
-            When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the
-            bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``.
-
-            Default Value;
-                If not provided, the default values are picked based on ``format`` and ``"encoding"``;
-
-                ``"wav"``, ``"amb"``;
-                    - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the
-                      | Tensor is used.
-
-                        - ``8`` if dtype is ``uint8``
-                        - ``16`` if dtype is ``int16``
-                        - ``32`` if dtype is  ``int32`` or ``float32``
-
-                    - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
-                    - ``16`` if ``encoding`` is ``"PCM_S"``
-                    - ``32`` if ``encoding`` is ``"PCM_F"``
-
-                ``"flac"`` format;
-                    - the default value is ``24``
-
-                ``"sph"`` format;
-                    - ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided.
-                    - ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"``
-
-                ``"amb"`` format;
-                    - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"``
-                    - ``16`` if ``encoding`` is ``"PCM_S"`` or not provided.
-                    - ``32`` if ``encoding`` is ``"PCM_F"``
-
-    Supported formats/encodings/bit depth/compression are;
-
-    ``"wav"``, ``"amb"``
-        - 32-bit floating-point PCM
-        - 32-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 8-bit unsigned integer PCM
-        - 8-bit mu-law
-        - 8-bit a-law
-
-        Note: Default encoding/bit depth is determined by the dtype of the input Tensor.
-
-    ``"mp3"``
-        Fixed bit rate (such as 128kHz) and variable bit rate compression.
-        Default: VBR with high quality.
-
-    ``"flac"``
-        - 8-bit
-        - 16-bit
-        - 24-bit (default)
-
-    ``"ogg"``, ``"vorbis"``
-        - Different quality level. Default: approx. 112kbps
-
-    ``"sph"``
-        - 8-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 32-bit signed integer PCM (default)
-        - 8-bit mu-law
-        - 8-bit a-law
-        - 16-bit a-law
-        - 24-bit a-law
-        - 32-bit a-law
-
-    ``"amr-nb"``
-        Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s
-
-    ``"gsm"``
-        Lossy Speech Compression, CPU intensive.
-
-    ``"htk"``
-        Uses a default single-channel 16-bit PCM format.
-
-    Note:
-        To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,
-        ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has
-        to be linked to ``libsox`` and corresponding codec libraries such as ``libmad``
-        or ``libmp3lame`` etc.
-    """
-    if not torch.jit.is_scripting():
-        if hasattr(filepath, "write"):
-            torchaudio._torchaudio.save_audio_fileobj(
-                filepath,
-                src,
-                sample_rate,
-                channels_first,
-                compression,
-                format,
-                encoding,
-                bits_per_sample,
-            )
-            return
-        filepath = os.fspath(filepath)
-    torch.ops.torchaudio.sox_io_save_audio_file(
-        filepath,
-        src,
-        sample_rate,
-        channels_first,
-        compression,
-        format,
-        encoding,
-        bits_per_sample,
-    )
+    return getattr(_sox_io_backend, name)
--- a/torchaudio/backend/utils.py
+++ b/torchaudio/backend/utils.py
@@ -3,9 +3,10 @@ import warnings
 from typing import List, Optional

 import torchaudio
+from torchaudio._backend import soundfile_backend
 from torchaudio._internal import module_utils as _mod_utils

-from . import no_backend, soundfile_backend, sox_io_backend
+from . import _no_backend as no_backend, _sox_io_backend as sox_io_backend

 __all__ = [
    "list_audio_backends",
@@ -23,7 +24,7 @@ def list_audio_backends() -> List[str]:
    backends = []
    if _mod_utils.is_module_available("soundfile"):
        backends.append("soundfile")
-    if _mod_utils.is_sox_available():
+    if torchaudio._extension._SOX_INITIALIZED:
        backends.append("sox_io")
    return backends

@@ -52,14 +53,19 @@ def set_audio_backend(backend: Optional[str]):
        setattr(torchaudio, func, getattr(module, func))


-def _init_audio_backend():
+def _init_backend():
+    warnings.warn(
+        "TorchAudio's global backend is now deprecated. "
+        "Please enable dispatcher by setting `TORCHAUDIO_USE_BACKEND_DISPATCHER=1`, "
+        "and specify backend when calling load/info/save function.",
+        stacklevel=3,
+    )
    backends = list_audio_backends()
    if "sox_io" in backends:
        set_audio_backend("sox_io")
    elif "soundfile" in backends:
        set_audio_backend("soundfile")
    else:
-        warnings.warn("No audio backend is available.")
        set_audio_backend(None)