Commit b14ced1a authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Use dlopen for FFmpeg (#3353)

Summary:
This commit changes the way FFmpeg extension is built and used.
Instead of linking (LGPL) FFmpeg libraries to torchaudio at build time,
it uses dlopen to search for and link them at run time.

For dlopen-ing, we use PyTorch's `at::DynamicLibrary` class, which provides
a portable wrapper.

Pull Request resolved: https://github.com/pytorch/audio/pull/3353

Differential Revision: D46059199

Pulled By: mthrok

fbshipit-source-id: 4493a5fd8a4c802178d20276522f5334d637307d
parent bc54ac8a
......@@ -2,11 +2,10 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)
add_library(ffmpeg INTERFACE)
target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}")
target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}")
set(
sources
libav.cpp
ffmpeg.cpp
filter_graph.cpp
hw_context.cpp
......
#include <c10/util/Exception.h>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <sstream>
#include <stdexcept>
#include <string>
......@@ -8,6 +9,8 @@
namespace torchaudio {
namespace io {
using torchaudio::io::detail::libav;
////////////////////////////////////////////////////////////////////////////////
// AVDictionary
////////////////////////////////////////////////////////////////////////////////
......@@ -15,7 +18,7 @@ AVDictionary* get_option_dict(const c10::optional<OptionDict>& option) {
AVDictionary* opt = nullptr;
if (option) {
for (auto const& [key, value] : option.value()) {
av_dict_set(&opt, key.c_str(), value.c_str(), 0);
libav().av_dict_set(&opt, key.c_str(), value.c_str(), 0);
}
}
return opt;
......@@ -26,10 +29,10 @@ void clean_up_dict(AVDictionary* p) {
std::vector<std::string> unused_keys;
// Check and copy unused keys, clean up the original dictionary
AVDictionaryEntry* t = nullptr;
while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
while ((t = libav().av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
unused_keys.emplace_back(t->key);
}
av_dict_free(&p);
libav().av_dict_free(&p);
TORCH_CHECK(
unused_keys.empty(),
"Unexpected options: ",
......@@ -41,14 +44,14 @@ void clean_up_dict(AVDictionary* p) {
// AVFormatContext
////////////////////////////////////////////////////////////////////////////////
// Deleter for demuxer (input) contexts: closes the input and frees the
// context. Only the dlopen-based libav() entry point is used; the stale
// direct avformat_close_input call (leftover from the pre-dlopen code)
// would have closed the context twice.
void AVFormatInputContextDeleter::operator()(AVFormatContext* p) {
  libav().avformat_close_input(&p);
};

AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p)
    : Wrapper<AVFormatContext, AVFormatInputContextDeleter>(p) {}
// Deleter for muxer (output) contexts. Note: outputs use
// avformat_free_context (no avformat_close_input counterpart). The
// duplicate direct call left over from the diff would double-free.
void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) {
  libav().avformat_free_context(p);
};
AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
......@@ -58,9 +61,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
// AVIO
////////////////////////////////////////////////////////////////////////////////
// Deleter for custom AVIO contexts: flush pending data, then release the
// internal buffer and the context itself (both FFmpeg-allocated). Each
// step must run exactly once — the stale direct calls from before the
// dlopen refactor are removed to avoid a double free.
void AVIOContextDeleter::operator()(AVIOContext* p) {
  libav().avio_flush(p);
  libav().av_freep(&p->buffer);
  libav().av_freep(&p);
};
AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
......@@ -70,13 +73,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
// AVPacket
////////////////////////////////////////////////////////////////////////////////
// Deleter for AVPacket smart pointers.
void AVPacketDeleter::operator()(AVPacket* p) {
  libav().av_packet_free(&p);
};

AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper<AVPacket, AVPacketDeleter>(p) {}

// Allocate an empty AVPacket wrapped in an RAII pointer.
// (The duplicate declaration of `p` left by the stripped diff — one direct
// call, one via libav() — would not compile; only the dlopen path is kept.)
AVPacketPtr alloc_avpacket() {
  AVPacket* p = libav().av_packet_alloc();
  TORCH_CHECK(p, "Failed to allocate AVPacket object.");
  return AVPacketPtr{p};
}
......@@ -86,7 +89,7 @@ AVPacketPtr alloc_avpacket() {
////////////////////////////////////////////////////////////////////////////////
AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){};

// Unreference (not free) the packet on scope exit, exactly once; the
// leftover direct av_packet_unref call is removed.
AutoPacketUnref::~AutoPacketUnref() {
  libav().av_packet_unref(p_);
}
AutoPacketUnref::operator AVPacket*() const {
return p_;
......@@ -96,13 +99,13 @@ AutoPacketUnref::operator AVPacket*() const {
// AVFrame
////////////////////////////////////////////////////////////////////////////////
// Deleter for AVFrame smart pointers.
void AVFrameDeleter::operator()(AVFrame* p) {
  libav().av_frame_free(&p);
};

AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper<AVFrame, AVFrameDeleter>(p) {}

// Allocate an empty AVFrame wrapped in an RAII pointer. The duplicate
// declaration of `p` (pre-dlopen leftover) is removed — it would not
// compile and would leak the first allocation.
AVFramePtr alloc_avframe() {
  AVFrame* p = libav().av_frame_alloc();
  TORCH_CHECK(p, "Failed to allocate AVFrame object.");
  return AVFramePtr{p};
};
......@@ -111,7 +114,7 @@ AVFramePtr alloc_avframe() {
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
// Deleter for codec contexts; must free exactly once (duplicate direct
// call removed).
void AVCodecContextDeleter::operator()(AVCodecContext* p) {
  libav().avcodec_free_context(&p);
};
AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
......@@ -121,7 +124,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
// AVBufferRefPtr
////////////////////////////////////////////////////////////////////////////////
// Deleter for AVBufferRef smart pointers; a single unref drops this
// reference (duplicate direct call removed).
void AutoBufferUnref::operator()(AVBufferRef* p) {
  libav().av_buffer_unref(&p);
}
AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
......@@ -131,7 +134,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
// AVFilterGraph
////////////////////////////////////////////////////////////////////////////////
// Deleter for filter graphs; frees the graph and all of its filters
// exactly once (duplicate direct call removed).
void AVFilterGraphDeleter::operator()(AVFilterGraph* p) {
  libav().avfilter_graph_free(&p);
};
AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
......@@ -141,7 +144,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
// AVCodecParameters
////////////////////////////////////////////////////////////////////////////////
// Deleter for codec-parameter objects; frees exactly once (duplicate
// direct call removed).
void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) {
  libav().avcodec_parameters_free(&codecpar);
}
AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p)
......
......@@ -6,6 +6,9 @@
#include <memory>
#include <string>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/macro.h>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavdevice/avdevice.h>
......@@ -29,21 +32,13 @@ namespace io {
using OptionDict = std::map<std::string, std::string>;
// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
// Starting from libavformat 59 (ffmpeg 5),
// AVInputFormat is const and related functions expect constant.
#if LIBAVFORMAT_VERSION_MAJOR >= 59
#define AVFORMAT_CONST const
#else
#define AVFORMAT_CONST
#endif
// Replacement of av_err2str, which causes
// `error: taking address of temporary array`
// https://github.com/joncampbell123/composite-video-simulator/issues/5
// Convert an FFmpeg error code into a std::string.
// Replacement of av_err2str, which causes
// `error: taking address of temporary array` in C++.
// The stale `return av_make_error_string(...)` line (pre-dlopen version,
// left in by the stripped diff) is removed: it made the av_strerror path
// unreachable and referenced a symbol that is no longer linked.
av_always_inline std::string av_err2string(int errnum) {
  char str[AV_ERROR_MAX_STRING_SIZE];
  detail::libav().av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE);
  return str;
}
// Base structure that handles memory management.
......
#include <torchaudio/csrc/ffmpeg/filter_graph.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <stdexcept>
namespace torchaudio {
namespace io {
using torchaudio::io::detail::libav;
namespace {
AVFilterGraph* get_filter_graph() {
AVFilterGraph* ptr = avfilter_graph_alloc();
AVFilterGraph* ptr = libav().avfilter_graph_alloc();
TORCH_CHECK(ptr, "Failed to allocate resouce.");
ptr->nb_threads = 1;
return ptr;
......@@ -32,7 +35,7 @@ std::string get_audio_src_args(
time_base.num,
time_base.den,
sample_rate,
av_get_sample_fmt_name(format),
libav().av_get_sample_fmt_name(format),
channel_layout);
return std::string(args);
}
......@@ -51,7 +54,7 @@ std::string get_video_src_args(
"video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d",
width,
height,
av_get_pix_fmt_name(format),
libav().av_get_pix_fmt_name(format),
time_base.num,
time_base.den,
frame_rate.num,
......@@ -69,7 +72,7 @@ void FilterGraph::add_audio_src(
int sample_rate,
uint64_t channel_layout) {
add_src(
avfilter_get_by_name("abuffer"),
libav().avfilter_get_by_name("abuffer"),
get_audio_src_args(format, time_base, sample_rate, channel_layout));
}
......@@ -81,13 +84,13 @@ void FilterGraph::add_video_src(
int height,
AVRational sample_aspect_ratio) {
add_src(
avfilter_get_by_name("buffer"),
libav().avfilter_get_by_name("buffer"),
get_video_src_args(
format, time_base, frame_rate, width, height, sample_aspect_ratio));
}
void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
int ret = avfilter_graph_create_filter(
int ret = libav().avfilter_graph_create_filter(
&buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph);
TORCH_CHECK(
ret >= 0,
......@@ -96,11 +99,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
}
// Terminate the graph with an audio buffer sink. The sink must be added
// exactly once; the leftover direct avfilter_get_by_name call is removed.
void FilterGraph::add_audio_sink() {
  add_sink(libav().avfilter_get_by_name("abuffersink"));
}
// Terminate the graph with a video buffer sink, added exactly once
// (leftover direct call removed).
void FilterGraph::add_video_sink() {
  add_sink(libav().avfilter_get_by_name("buffersink"));
}
void FilterGraph::add_sink(const AVFilter* buffersink) {
......@@ -114,7 +117,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) {
// According to the other example
// https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html
// `abuffersink` should not take options, and this resolved issue.
int ret = avfilter_graph_create_filter(
int ret = libav().avfilter_graph_create_filter(
&buffersink_ctx, buffersink, "out", nullptr, nullptr, graph);
TORCH_CHECK(ret >= 0, "Failed to create output filter.");
}
......@@ -131,15 +134,15 @@ class InOuts {
public:
InOuts(const char* name, AVFilterContext* pCtx) {
p = avfilter_inout_alloc();
p = libav().avfilter_inout_alloc();
TORCH_CHECK(p, "Failed to allocate AVFilterInOut.");
p->name = av_strdup(name);
p->name = libav().av_strdup(name);
p->filter_ctx = pCtx;
p->pad_idx = 0;
p->next = nullptr;
}
~InOuts() {
avfilter_inout_free(&p);
libav().avfilter_inout_free(&p);
}
operator AVFilterInOut**() {
return &p;
......@@ -156,7 +159,7 @@ void FilterGraph::add_process(const std::string& filter_description) {
// If you are debugging this part of the code, you might get confused.
InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};
int ret = avfilter_graph_parse_ptr(
int ret = libav().avfilter_graph_parse_ptr(
graph, filter_description.c_str(), out, in, nullptr);
TORCH_CHECK(
......@@ -167,11 +170,11 @@ void FilterGraph::add_process(const std::string& filter_description) {
void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
int ret = avfilter_graph_config(graph, nullptr);
int ret = libav().avfilter_graph_config(graph, nullptr);
TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
// char* desc = avfilter_graph_dump(graph, NULL);
// char* desc = libav().avfilter_graph_dump(graph, NULL);
// std::cerr << "Filter created:\n" << desc << std::endl;
// av_free(static_cast<void*>(desc));
// libav().av_free(static_cast<void*>(desc));
}
//////////////////////////////////////////////////////////////////////////////
......@@ -191,7 +194,8 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
ret.num_channels = l->ch_layout.nb_channels;
#else
// Before FFmpeg 5.1
ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
ret.num_channels =
libav().av_get_channel_layout_nb_channels(l->channel_layout);
#endif
break;
}
......@@ -214,12 +218,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
// Streaming process
//////////////////////////////////////////////////////////////////////////////
// Feed a frame into the graph. KEEP_REF makes the graph take its own
// reference, so the caller retains ownership of pInputFrame. The stale
// pre-dlopen `return` line is removed — it made the libav() call
// unreachable.
int FilterGraph::add_frame(AVFrame* pInputFrame) {
  return libav().av_buffersrc_add_frame_flags(
      buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF);
}
// Pull the next filtered frame from the sink, propagating
// av_buffersink_get_frame's return code (stale duplicate return removed).
int FilterGraph::get_frame(AVFrame* pOutputFrame) {
  return libav().av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
}
} // namespace io
......
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
namespace torchaudio::io {
using detail::libav;
namespace {
static std::mutex MUTEX;
......@@ -15,7 +19,7 @@ AVBufferRef* get_cuda_context(int index) {
}
if (CUDA_CONTEXT_CACHE.count(index) == 0) {
AVBufferRef* p = nullptr;
int ret = av_hwdevice_ctx_create(
int ret = libav().av_hwdevice_ctx_create(
&p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0);
TORCH_CHECK(
ret >= 0,
......
#include <ATen/DynamicLibrary.h>
#include <c10/util/CallOnce.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
extern "C" {
#include <libavcodec/version.h>
#include <libavdevice/version.h>
#include <libavfilter/version.h>
#include <libavformat/version.h>
#include <libavutil/version.h>
}
namespace torchaudio::io::detail {
namespace {
// Opens the five FFmpeg shared libraries via dlopen (through PyTorch's
// at::DynamicLibrary wrapper) and resolves every symbol torchaudio uses
// into the `libav` function-pointer table.
//
// The at::DynamicLibrary members keep the library handles alive for the
// lifetime of this object.
class LibAVImpl {
  at::DynamicLibrary libavutil;
  at::DynamicLibrary libavcodec;
  at::DynamicLibrary libavformat;
  at::DynamicLibrary libavdevice;
  at::DynamicLibrary libavfilter;

 public:
  // The struct that holds all the function pointers to be used.
  LibAV libav{};

  // Open the given library files (platform-specific names are built in
  // _load_libav) and resolve all symbols eagerly, so a missing symbol is
  // reported at load time rather than at first use.
  // NOTE(review): presumably at::DynamicLibrary::sym/ctor raise when a
  // library or symbol is missing — confirm against ATen.
  LibAVImpl(
      const char* util,
      const char* codec,
      const char* format,
      const char* device,
      const char* filter)
      : libavutil(util),
        libavcodec(codec),
        libavformat(format),
        libavdevice(device),
        libavfilter(filter) {
// Resolve symbol X from libavutil and store it, cast to the matching
// member's function-pointer type, in this->libav.X.
#define set(X) this->libav.X = (decltype(LibAV::X))libavutil.sym(#X)
    set(av_buffer_ref);
    set(av_buffer_unref);
    set(av_d2q);
    set(av_dict_free);
    set(av_dict_get);
    set(av_dict_set);
    set(av_frame_alloc);
    set(av_frame_free);
    set(av_frame_get_buffer);
    set(av_frame_is_writable);
    set(av_frame_make_writable);
    set(av_frame_unref);
    set(av_freep);
    set(av_get_channel_layout_nb_channels);
    set(av_get_channel_name);
    set(av_get_default_channel_layout);
    set(av_get_media_type_string);
    set(av_get_pix_fmt);
    set(av_get_pix_fmt_name);
    set(av_get_sample_fmt);
    set(av_get_sample_fmt_name);
    set(av_get_time_base_q);
    set(av_hwdevice_ctx_create);
    set(av_hwframe_ctx_alloc);
    set(av_hwframe_ctx_init);
    set(av_hwframe_get_buffer);
    set(av_log_get_level);
    set(av_log_set_level);
    set(av_malloc);
    set(av_pix_fmt_desc_get);
    set(av_rescale_q);
    set(av_sample_fmt_is_planar);
    set(av_strdup);
    set(av_strerror);
    set(avutil_version);
#undef set
// Same pattern, resolving from libavcodec.
#define set(X) this->libav.X = (decltype(LibAV::X))libavcodec.sym(#X)
    set(av_codec_is_decoder);
    set(av_codec_is_encoder);
    set(av_codec_iterate);
    set(av_packet_alloc);
    set(av_packet_clone);
    set(av_packet_free);
    set(av_packet_ref);
    set(av_packet_rescale_ts);
    set(av_packet_unref);
    set(avcodec_alloc_context3);
    set(avcodec_configuration);
    set(avcodec_descriptor_get);
    set(avcodec_find_decoder);
    set(avcodec_find_decoder_by_name);
    set(avcodec_find_encoder);
    set(avcodec_find_encoder_by_name);
    set(avcodec_flush_buffers);
    set(avcodec_free_context);
    set(avcodec_get_hw_config);
    set(avcodec_get_name);
    set(avcodec_open2);
    set(avcodec_parameters_alloc);
    set(avcodec_parameters_copy);
    set(avcodec_parameters_free);
    set(avcodec_parameters_from_context);
    set(avcodec_parameters_to_context);
    set(avcodec_receive_frame);
    set(avcodec_receive_packet);
    set(avcodec_send_frame);
    set(avcodec_send_packet);
    set(avcodec_version);
#undef set
// Same pattern, resolving from libavformat.
#define set(X) this->libav.X = (decltype(LibAV::X))libavformat.sym(#X)
    set(av_demuxer_iterate);
    set(av_dump_format);
    set(av_find_best_stream);
    set(av_find_input_format);
    set(av_guess_frame_rate);
    set(av_interleaved_write_frame);
    set(av_muxer_iterate);
    set(av_read_frame);
    set(av_seek_frame);
    set(av_write_trailer);
    set(avio_alloc_context);
    set(avio_enum_protocols);
    set(avio_closep);
    set(avio_flush);
    set(avio_open2);
    set(avformat_alloc_context);
    set(avformat_alloc_output_context2);
    set(avformat_close_input);
    set(avformat_find_stream_info);
    set(avformat_free_context);
    set(avformat_new_stream);
    set(avformat_open_input);
    set(avformat_version);
    set(avformat_write_header);
#undef set
// Same pattern, resolving from libavdevice.
#define set(X) this->libav.X = (decltype(LibAV::X))libavdevice.sym(#X)
    set(avdevice_register_all);
    set(avdevice_version);
#undef set
// Same pattern, resolving from libavfilter.
#define set(X) this->libav.X = (decltype(LibAV::X))libavfilter.sym(#X)
    set(av_buffersink_get_frame);
    set(av_buffersrc_add_frame_flags);
    set(avfilter_get_by_name);
    set(avfilter_graph_alloc);
    set(avfilter_graph_config);
    set(avfilter_graph_create_filter);
    set(avfilter_graph_free);
    set(avfilter_graph_parse_ptr);
    set(avfilter_inout_alloc);
    set(avfilter_inout_free);
    set(avfilter_version);
#undef set
  }
};
// Singleton holding the dlopen handles and the resolved symbol table.
static std::unique_ptr<LibAVImpl> _libav;

// Construct the global LibAVImpl, opening the FFmpeg libraries by their
// platform-specific, major-version-suffixed file names (e.g. avutil-56.dll,
// libavutil.56.dylib, libavutil.so.56). The major version numbers come from
// the FFmpeg headers torchaudio was compiled against, so only
// ABI-compatible runtime libraries are picked up.
void _load_libav() {
#if defined(_WIN32)
  _libav = std::make_unique<LibAVImpl>(
      "avutil-" AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dll",
      "avcodec-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dll",
      "avformat-" AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dll",
      "avdevice-" AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dll",
      "avfilter-" AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dll");
#elif defined(__APPLE__)
  _libav = std::make_unique<LibAVImpl>(
      "libavutil." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dylib",
      "libavcodec." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dylib",
      "libavformat." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dylib",
      "libavdevice." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dylib",
      "libavfilter." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dylib");
#else
  _libav = std::make_unique<LibAVImpl>(
      "libavutil.so." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR),
      "libavcodec.so." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR),
      "libavformat.so." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR),
      "libavdevice.so." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR),
      "libavfilter.so." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR));
#endif
}
} // namespace
// Fetch the table of FFmpeg function pointers, loading the shared
// libraries on first use. c10::call_once makes the lazy initialization
// thread-safe; if loading fails, the exception propagates to the caller.
LibAV& libav() {
  static c10::once_flag init_flag;
  c10::call_once(init_flag, _load_libav);
  return _libav->libav;
}
} // namespace torchaudio::io::detail
#pragma once
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavfilter/avfilter.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/pixdesc.h>
}
#include <torchaudio/csrc/ffmpeg/macro.h>
namespace torchaudio::io::detail {
// Table of function pointers into the dlopen-ed FFmpeg libraries.
// Each member mirrors the signature of the FFmpeg function of the same
// name; call sites use `libav().<name>(...)` instead of linking the symbol
// at build time. The table is populated by LibAVImpl (libav.cpp), which
// resolves every member eagerly when the libraries are first loaded.
struct LibAV {
  /////////////////////////////////////////////////////////////////////////////
  // libavutil
  /////////////////////////////////////////////////////////////////////////////
  AVBufferRef* (*av_buffer_ref)(const AVBufferRef*);
  void (*av_buffer_unref)(AVBufferRef**);
  AVRational (*av_d2q)(double, int) av_const;
  void (*av_dict_free)(AVDictionary**);
  AVDictionaryEntry* (*av_dict_get)(
      const AVDictionary*,
      const char*,
      const AVDictionaryEntry*,
      int);
  int (*av_dict_set)(AVDictionary**, const char*, const char*, int);
  AVFrame* (*av_frame_alloc)();
  void (*av_frame_free)(AVFrame**);
  int (*av_frame_get_buffer)(AVFrame*, int);
  int (*av_frame_is_writable)(AVFrame*);
  int (*av_frame_make_writable)(AVFrame*);
  void (*av_frame_unref)(AVFrame*);
  // Takes void* (not void**) to match FFmpeg's av_freep declaration.
  void (*av_freep)(void*);
  int (*av_get_channel_layout_nb_channels)(uint64_t);
  const char* (*av_get_channel_name)(uint64_t);
  int64_t (*av_get_default_channel_layout)(int);
  const char* (*av_get_media_type_string)(enum AVMediaType);
  enum AVPixelFormat (*av_get_pix_fmt)(const char*);
  const char* (*av_get_pix_fmt_name)(enum AVPixelFormat);
  enum AVSampleFormat (*av_get_sample_fmt)(const char*);
  const char* (*av_get_sample_fmt_name)(enum AVSampleFormat);
  AVRational (*av_get_time_base_q)();
  int (*av_hwdevice_ctx_create)(
      AVBufferRef**,
      enum AVHWDeviceType,
      const char*,
      AVDictionary*,
      int);
  AVBufferRef* (*av_hwframe_ctx_alloc)(AVBufferRef*);
  int (*av_hwframe_ctx_init)(AVBufferRef*);
  int (*av_hwframe_get_buffer)(AVBufferRef*, AVFrame*, int);
  int (*av_log_get_level)();
  void (*av_log_set_level)(int);
  void* (*av_malloc)(size_t);
  const AVPixFmtDescriptor* (*av_pix_fmt_desc_get)(enum AVPixelFormat);
  int64_t (*av_rescale_q)(int64_t, AVRational, AVRational) av_const;
  int (*av_sample_fmt_is_planar)(enum AVSampleFormat);
  char* (*av_strdup)(const char*);
  int (*av_strerror)(int, char*, size_t);
  unsigned (*avutil_version)();

  /////////////////////////////////////////////////////////////////////////////
  // libavcodec
  /////////////////////////////////////////////////////////////////////////////
  int (*av_codec_is_decoder)(const AVCodec*);
  int (*av_codec_is_encoder)(const AVCodec*);
  const AVCodec* (*av_codec_iterate)(void**);
  AVPacket* (*av_packet_alloc)();
  AVPacket* (*av_packet_clone)(const AVPacket*);
  void (*av_packet_free)(AVPacket**);
  int (*av_packet_ref)(AVPacket*, const AVPacket*);
  void (*av_packet_rescale_ts)(AVPacket*, AVRational, AVRational);
  void (*av_packet_unref)(AVPacket*);
  AVCodecContext* (*avcodec_alloc_context3)(const AVCodec*);
  const char* (*avcodec_configuration)();
  const AVCodecDescriptor* (*avcodec_descriptor_get)(enum AVCodecID);
  AVCodec* (*avcodec_find_decoder)(enum AVCodecID);
  AVCodec* (*avcodec_find_decoder_by_name)(const char*);
  AVCodec* (*avcodec_find_encoder)(enum AVCodecID);
  AVCodec* (*avcodec_find_encoder_by_name)(const char*);
  void (*avcodec_flush_buffers)(AVCodecContext*);
  void (*avcodec_free_context)(AVCodecContext**);
  const AVCodecHWConfig* (*avcodec_get_hw_config)(const AVCodec*, int);
  const char* (*avcodec_get_name)(enum AVCodecID);
  int (*avcodec_open2)(AVCodecContext*, const AVCodec*, AVDictionary**);
  AVCodecParameters* (*avcodec_parameters_alloc)();
  int (*avcodec_parameters_copy)(AVCodecParameters*, const AVCodecParameters*);
  void (*avcodec_parameters_free)(AVCodecParameters**);
  int (*avcodec_parameters_from_context)(
      AVCodecParameters*,
      const AVCodecContext*);
  int (*avcodec_parameters_to_context)(
      AVCodecContext*,
      const AVCodecParameters*);
  int (*avcodec_receive_frame)(AVCodecContext*, AVFrame*);
  int (*avcodec_receive_packet)(AVCodecContext*, AVPacket*);
  int (*avcodec_send_frame)(AVCodecContext*, const AVFrame*);
  int (*avcodec_send_packet)(AVCodecContext*, const AVPacket*);
  unsigned (*avcodec_version)();

  /////////////////////////////////////////////////////////////////////////////
  // libavformat
  /////////////////////////////////////////////////////////////////////////////
  const AVInputFormat* (*av_demuxer_iterate)(void**);
  void (*av_dump_format)(AVFormatContext*, int, const char*, int);
  int (*av_find_best_stream)(
      AVFormatContext*,
      enum AVMediaType,
      int,
      int,
      AVCodec**,
      int);
  AVInputFormat* (*av_find_input_format)(const char*);
  AVRational (*av_guess_frame_rate)(AVFormatContext*, AVStream*, AVFrame*);
  int (*av_interleaved_write_frame)(AVFormatContext*, AVPacket*);
  const AVOutputFormat* (*av_muxer_iterate)(void**);
  int (*av_read_frame)(AVFormatContext*, AVPacket*);
  int (*av_seek_frame)(AVFormatContext*, int, int64_t, int);
  int (*av_write_trailer)(AVFormatContext* s);
  AVIOContext* (*avio_alloc_context)(
      unsigned char*,
      int,
      int,
      void*,
      int (*)(void*, uint8_t*, int),
      int (*)(void*, uint8_t*, int),
      int64_t (*)(void*, int64_t, int));
  const char* (*avio_enum_protocols)(void**, int);
  int (*avio_closep)(AVIOContext**);
  void (*avio_flush)(AVIOContext*);
  int (*avio_open2)(
      AVIOContext**,
      const char*,
      int,
      const AVIOInterruptCB*,
      AVDictionary**);
  AVFormatContext* (*avformat_alloc_context)();
  int (*avformat_alloc_output_context2)(
      AVFormatContext**,
      AVOutputFormat*,
      const char*,
      const char*);
  void (*avformat_close_input)(AVFormatContext**);
  int (*avformat_find_stream_info)(AVFormatContext*, AVDictionary**);
  void (*avformat_free_context)(AVFormatContext*);
  AVStream* (*avformat_new_stream)(AVFormatContext*, const AVCodec*);
  // AVFORMAT_CONST (macro.h) adds `const` for libavformat >= 59, where
  // AVInputFormat pointers became pointer-to-const.
  int (*avformat_open_input)(
      AVFormatContext**,
      const char*,
      AVFORMAT_CONST AVInputFormat*,
      AVDictionary**);
  unsigned (*avformat_version)();
  int (*avformat_write_header)(AVFormatContext*, AVDictionary**);

  /////////////////////////////////////////////////////////////////////////////
  // libavdevice
  /////////////////////////////////////////////////////////////////////////////
  void (*avdevice_register_all)();
  unsigned (*avdevice_version)();

  /////////////////////////////////////////////////////////////////////////////
  // libavfilter
  /////////////////////////////////////////////////////////////////////////////
  int (*av_buffersink_get_frame)(AVFilterContext*, AVFrame*);
  int (*av_buffersrc_add_frame_flags)(AVFilterContext*, AVFrame*, int);
  const AVFilter* (*avfilter_get_by_name)(const char*);
  AVFilterGraph* (*avfilter_graph_alloc)();
  int (*avfilter_graph_config)(AVFilterGraph*, void*);
  int (*avfilter_graph_create_filter)(
      AVFilterContext**,
      const AVFilter*,
      const char*,
      const char*,
      void*,
      AVFilterGraph*);
  void (*avfilter_graph_free)(AVFilterGraph**);
  int (*avfilter_graph_parse_ptr)(
      AVFilterGraph*,
      const char*,
      AVFilterInOut**,
      AVFilterInOut**,
      void*);
  AVFilterInOut* (*avfilter_inout_alloc)();
  void (*avfilter_inout_free)(AVFilterInOut**);
  unsigned (*avfilter_version)();
};

// Fetch handler for dlopen-ed FFmpeg libraries.
LibAV& libav();
} // namespace torchaudio::io::detail
#pragma once

extern "C" {
#include <libavformat/version.h>
}

#ifndef LIBAVFORMAT_VERSION_MAJOR
#error LIBAVFORMAT_VERSION_MAJOR is not defined.
#endif

// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
// Starting from libavformat 59 (FFmpeg 5), AVInputFormat is const and
// related functions expect pointer-to-const. AVFORMAT_CONST lets the same
// signatures compile against both FFmpeg 4 and FFmpeg 5+.
#if LIBAVFORMAT_VERSION_MAJOR >= 59
#define AVFORMAT_CONST const
#else
#define AVFORMAT_CONST
#endif
#include <torch/extension.h>
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
namespace torchaudio {
namespace io {
using detail::libav;
namespace {
std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
......@@ -12,7 +16,7 @@ std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
#define add_version(NAME) \
{ \
int ver = NAME##_version(); \
int ver = libav().NAME##_version(); \
ret.emplace( \
"lib" #NAME, \
std::make_tuple<>( \
......@@ -35,7 +39,7 @@ std::map<std::string, std::string> get_demuxers(bool req_device) {
std::map<std::string, std::string> ret;
const AVInputFormat* fmt = nullptr;
void* i = nullptr;
while ((fmt = av_demuxer_iterate(&i))) {
while ((fmt = libav().av_demuxer_iterate(&i))) {
assert(fmt);
bool is_device = [&]() {
const AVClass* avclass = fmt->priv_class;
......@@ -52,7 +56,7 @@ std::map<std::string, std::string> get_muxers(bool req_device) {
std::map<std::string, std::string> ret;
const AVOutputFormat* fmt = nullptr;
void* i = nullptr;
while ((fmt = av_muxer_iterate(&i))) {
while ((fmt = libav().av_muxer_iterate(&i))) {
assert(fmt);
bool is_device = [&]() {
const AVClass* avclass = fmt->priv_class;
......@@ -71,10 +75,10 @@ std::map<std::string, std::string> get_codecs(
const AVCodec* c = nullptr;
void* i = nullptr;
std::map<std::string, std::string> ret;
while ((c = av_codec_iterate(&i))) {
while ((c = libav().av_codec_iterate(&i))) {
assert(c);
if ((req_encoder && av_codec_is_encoder(c)) ||
(!req_encoder && av_codec_is_decoder(c))) {
if ((req_encoder && libav().av_codec_is_encoder(c)) ||
(!req_encoder && libav().av_codec_is_decoder(c))) {
if (c->type == type && c->name) {
ret.emplace(c->name, c->long_name ? c->long_name : "");
}
......@@ -87,7 +91,7 @@ std::vector<std::string> get_protocols(bool output) {
void* opaque = nullptr;
const char* name = nullptr;
std::vector<std::string> ret;
while ((name = avio_enum_protocols(&opaque, output))) {
while ((name = libav().avio_enum_protocols(&opaque, output))) {
assert(name);
ret.emplace_back(name);
}
......@@ -95,7 +99,7 @@ std::vector<std::string> get_protocols(bool output) {
}
// Report the configuration flags FFmpeg was built with; the stale direct
// `return avcodec_configuration();` (unreachable duplicate from the
// stripped diff) is removed.
std::string get_build_config() {
  return libav().avcodec_configuration();
}
//////////////////////////////////////////////////////////////////////////////
......@@ -188,9 +192,9 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO {
};
PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
m.def("init", []() { avdevice_register_all(); });
m.def("get_log_level", []() { return av_log_get_level(); });
m.def("set_log_level", [](int level) { av_log_set_level(level); });
m.def("init", []() { libav().avdevice_register_all(); });
m.def("get_log_level", []() { return libav().av_log_get_level(); });
m.def("set_log_level", [](int level) { libav().av_log_set_level(level); });
m.def("get_versions", &get_versions);
m.def("get_muxers", []() { return get_muxers(false); });
m.def("get_demuxers", []() { return get_demuxers(false); });
......@@ -246,21 +250,22 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
.def_property_readonly(
"media_type",
[](const OutputStreamInfo& o) -> std::string {
return av_get_media_type_string(o.media_type);
return libav().av_get_media_type_string(o.media_type);
})
.def_property_readonly(
"format",
[](const OutputStreamInfo& o) -> std::string {
switch (o.media_type) {
case AVMEDIA_TYPE_AUDIO:
return av_get_sample_fmt_name((AVSampleFormat)(o.format));
return libav().av_get_sample_fmt_name(
(AVSampleFormat)(o.format));
case AVMEDIA_TYPE_VIDEO:
return av_get_pix_fmt_name((AVPixelFormat)(o.format));
return libav().av_get_pix_fmt_name((AVPixelFormat)(o.format));
default:
TORCH_INTERNAL_ASSERT(
false,
"FilterGraph is returning unexpected media type: ",
av_get_media_type_string(o.media_type));
libav().av_get_media_type_string(o.media_type));
}
})
.def_readonly("sample_rate", &OutputStreamInfo::sample_rate)
......@@ -284,7 +289,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
.def_property_readonly(
"media_type",
[](const SrcStreamInfo& s) {
return av_get_media_type_string(s.media_type);
return libav().av_get_media_type_string(s.media_type);
})
.def_readonly("codec_name", &SrcStreamInfo::codec_name)
.def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name)
......
#include <torch/torch.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
#ifdef USE_CUDA
......@@ -7,6 +8,8 @@
namespace torchaudio::io {
using detail::libav;
////////////////////////////////////////////////////////////////////////////////
// Audio
////////////////////////////////////////////////////////////////////////////////
......@@ -429,11 +432,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_CUDA == fmt,
"Expected CUDA frame. Found: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_NV12 == sw_fmt,
"Expected NV12 format. Found: ",
av_get_pix_fmt_name(sw_fmt));
libav().av_get_pix_fmt_name(sw_fmt));
// Write Y plane directly
auto status = cudaMemcpy2D(
......@@ -506,11 +509,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_CUDA == fmt,
"Expected CUDA frame. Found: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_P010 == sw_fmt,
"Expected P010 format. Found: ",
av_get_pix_fmt_name(sw_fmt));
libav().av_get_pix_fmt_name(sw_fmt));
// Write Y plane directly
auto status = cudaMemcpy2D(
......@@ -581,11 +584,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_CUDA == fmt,
"Expected CUDA frame. Found: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
TORCH_INTERNAL_ASSERT(
AV_PIX_FMT_YUV444P == sw_fmt,
"Expected YUV444P format. Found: ",
av_get_pix_fmt_name(sw_fmt));
libav().av_get_pix_fmt_name(sw_fmt));
// Write Y plane directly
for (int i = 0; i < num_channels; ++i) {
......
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.h>
namespace torchaudio {
namespace io {
using detail::libav;
// Store an owned copy of the packet; the caller keeps ownership of (and
// may unref) the original. The duplicate declaration of `p` (pre-dlopen
// leftover) is removed — it would not compile.
void PacketBuffer::push_packet(AVPacket* packet) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null.");
  AVPacket* p = libav().av_packet_clone(packet);
  TORCH_INTERNAL_ASSERT(p, "Failed to clone packet.");
  packets.emplace_back(p);
}
......
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/chunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
......@@ -5,6 +6,9 @@
namespace torchaudio::io {
namespace detail {
using detail::libav;
namespace {
///////////////////////////////////////////////////////////////////////////////
......@@ -48,7 +52,7 @@ FilterGraphFactory get_video_factory(
f.add_video_sink();
f.add_process(filter_desc);
if (hw_frames_ctx) {
f.create_filter(av_buffer_ref(hw_frames_ctx));
f.create_filter(libav().av_buffer_ref(hw_frames_ctx));
} else {
f.create_filter();
}
......@@ -139,7 +143,7 @@ struct ProcessImpl : public IPostDecodeProcess {
if (ret >= 0) {
buffer.push_frame(converter.convert(frame), frame->pts);
}
av_frame_unref(frame);
libav().av_frame_unref(frame);
}
return ret;
}
......@@ -159,7 +163,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
TORCH_INTERNAL_ASSERT(
i.type == AVMEDIA_TYPE_AUDIO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
using B = UnchunkedBuffer;
......@@ -226,7 +230,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
}
default:
TORCH_INTERNAL_ASSERT(
false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt));
}
}
......@@ -239,7 +243,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
i.type == AVMEDIA_TYPE_AUDIO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
using B = ChunkedBuffer;
B buffer{i.time_base, frames_per_chunk, num_chunks};
......@@ -307,7 +311,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
}
default:
TORCH_INTERNAL_ASSERT(
false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
false, "Unexpected audio type:", libav().av_get_sample_fmt_name(fmt));
}
}
......@@ -321,7 +325,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
i.type == AVMEDIA_TYPE_VIDEO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
auto h = i.height;
auto w = i.width;
......@@ -375,7 +379,9 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
}
default: {
TORCH_INTERNAL_ASSERT(
false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
false,
"Unexpected video format found: ",
libav().av_get_pix_fmt_name(fmt));
}
}
}
......@@ -393,7 +399,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
i.type == AVMEDIA_TYPE_VIDEO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
using B = UnchunkedBuffer;
switch (auto fmt = (AVPixelFormat)i.format; fmt) {
......@@ -416,13 +422,13 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
TORCH_CHECK(
false,
"Unsupported video format found in CUDA HW: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
}
default: {
TORCH_CHECK(
false,
"Unexpected video format found in CUDA HW: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
}
}
#endif
......@@ -437,7 +443,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
i.type == AVMEDIA_TYPE_VIDEO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
auto h = i.height;
auto w = i.width;
......@@ -491,7 +497,9 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
}
default: {
TORCH_INTERNAL_ASSERT(
false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
false,
"Unexpected video format found: ",
libav().av_get_pix_fmt_name(fmt));
}
}
}
......@@ -511,7 +519,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
i.type == AVMEDIA_TYPE_VIDEO,
"Unsupported media type found: ",
av_get_media_type_string(i.type));
libav().av_get_media_type_string(i.type));
using B = ChunkedBuffer;
switch (auto fmt = (AVPixelFormat)i.format; fmt) {
......@@ -540,13 +548,13 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
TORCH_CHECK(
false,
"Unsupported video format found in CUDA HW: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
}
default: {
TORCH_CHECK(
false,
"Unexpected video format found in CUDA HW: ",
av_get_pix_fmt_name(fmt));
libav().av_get_pix_fmt_name(fmt));
}
}
#endif
......
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>
#include <stdexcept>
#include <string_view>
......@@ -6,6 +7,8 @@
namespace torchaudio {
namespace io {
using detail::libav;
namespace {
AVCodecContextPtr alloc_codec_context(
enum AVCodecID codec_id,
......@@ -13,24 +16,24 @@ AVCodecContextPtr alloc_codec_context(
const AVCodec* codec = [&]() {
if (decoder_name) {
const AVCodec* c =
avcodec_find_decoder_by_name(decoder_name.value().c_str());
libav().avcodec_find_decoder_by_name(decoder_name.value().c_str());
TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value());
return c;
} else {
const AVCodec* c = avcodec_find_decoder(codec_id);
TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id));
const AVCodec* c = libav().avcodec_find_decoder(codec_id);
TORCH_CHECK(c, "Unsupported codec: ", libav().avcodec_get_name(codec_id));
return c;
}
}();
AVCodecContext* codec_ctx = avcodec_alloc_context3(codec);
AVCodecContext* codec_ctx = libav().avcodec_alloc_context3(codec);
TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext.");
return AVCodecContextPtr(codec_ctx);
}
const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) {
for (int i = 0;; ++i) {
const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i);
const AVCodecHWConfig* config = libav().avcodec_get_hw_config(codec, i);
if (!config) {
break;
}
......@@ -83,7 +86,7 @@ enum AVPixelFormat get_hw_format(
}
AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
AVBufferRef* p = libav().av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
TORCH_CHECK(
p,
"Failed to allocate CUDA frame context from device context at ",
......@@ -94,11 +97,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
frames_ctx->width = codec_ctx->width;
frames_ctx->height = codec_ctx->height;
frames_ctx->initial_pool_size = 5;
int ret = av_hwframe_ctx_init(p);
int ret = libav().av_hwframe_ctx_init(p);
if (ret >= 0) {
return p;
}
av_buffer_unref(&p);
libav().av_buffer_unref(&p);
TORCH_CHECK(
false, "Failed to initialize CUDA frame context: ", av_err2string(ret));
}
......@@ -107,7 +110,7 @@ void configure_codec_context(
AVCodecContext* codec_ctx,
const AVCodecParameters* params,
const torch::Device& device) {
int ret = avcodec_parameters_to_context(codec_ctx, params);
int ret = libav().avcodec_parameters_to_context(codec_ctx, params);
TORCH_CHECK(
ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret));
......@@ -122,7 +125,8 @@ void configure_codec_context(
// 2. Set pCodecContext->get_format call back function which
// will retrieve the HW pixel format from opaque pointer.
codec_ctx->get_format = get_hw_format;
codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index()));
codec_ctx->hw_device_ctx =
libav().av_buffer_ref(get_cuda_context(device.index()));
TORCH_INTERNAL_ASSERT(
codec_ctx->hw_device_ctx, "Failed to reference HW device context.");
#endif
......@@ -135,16 +139,16 @@ void open_codec(
AVDictionary* opts = get_option_dict(decoder_option);
// Default to single thread execution.
if (!av_dict_get(opts, "threads", nullptr, 0)) {
av_dict_set(&opts, "threads", "1", 0);
if (!libav().av_dict_get(opts, "threads", nullptr, 0)) {
libav().av_dict_set(&opts, "threads", "1", 0);
}
if (!codec_ctx->channel_layout) {
codec_ctx->channel_layout =
av_get_default_channel_layout(codec_ctx->channels);
libav().av_get_default_channel_layout(codec_ctx->channels);
}
int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
clean_up_dict(opts);
TORCH_CHECK(
ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret));
......@@ -259,8 +263,8 @@ void StreamProcessor::remove_stream(KeyType key) {
void StreamProcessor::set_discard_timestamp(int64_t timestamp) {
TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative.");
discard_before_pts =
av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base);
discard_before_pts = libav().av_rescale_q(
timestamp, libav().av_get_time_base_q(), stream_time_base);
}
void StreamProcessor::set_decoder(
......@@ -306,9 +310,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
is_decoder_set(),
"Decoder must have been set prior to calling this function.");
int ret = avcodec_send_packet(codec_ctx, packet);
int ret = libav().avcodec_send_packet(codec_ctx, packet);
while (ret >= 0) {
ret = avcodec_receive_frame(codec_ctx, frame);
ret = libav().avcodec_receive_frame(codec_ctx, frame);
// AVERROR(EAGAIN) means that new input data is required to return new
// output.
if (ret == AVERROR(EAGAIN))
......@@ -355,7 +359,7 @@ int StreamProcessor::process_packet(AVPacket* packet) {
}
// else we can just unref the frame and continue
av_frame_unref(frame);
libav().av_frame_unref(frame);
}
return ret;
}
......@@ -364,7 +368,7 @@ void StreamProcessor::flush() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
is_decoder_set(),
"Decoder must have been set prior to calling this function.");
avcodec_flush_buffers(codec_ctx);
libav().avcodec_flush_buffers(codec_ctx);
for (auto& ite : post_processes) {
ite.second->flush();
}
......
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
#include <chrono>
#include <sstream>
#include <stdexcept>
#include <thread>
extern "C" {
#include <libavutil/rational.h>
}
namespace torchaudio {
namespace io {
using detail::libav;
using KeyType = StreamProcessor::KeyType;
//////////////////////////////////////////////////////////////////////////////
......@@ -19,7 +25,7 @@ AVFormatContext* get_input_format_context(
const c10::optional<std::string>& format,
const c10::optional<OptionDict>& option,
AVIOContext* io_ctx) {
AVFormatContext* p = avformat_alloc_context();
AVFormatContext* p = libav().avformat_alloc_context();
TORCH_CHECK(p, "Failed to allocate AVFormatContext.");
if (io_ctx) {
p->pb = io_ctx;
......@@ -29,7 +35,7 @@ AVFormatContext* get_input_format_context(
if (format.has_value()) {
std::string format_str = format.value();
AVFORMAT_CONST AVInputFormat* pInput =
av_find_input_format(format_str.c_str());
libav().av_find_input_format(format_str.c_str());
TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\"");
return pInput;
}
......@@ -37,7 +43,7 @@ AVFormatContext* get_input_format_context(
}();
AVDictionary* opt = get_option_dict(option);
int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
int ret = libav().avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
clean_up_dict(opt);
TORCH_CHECK(
......@@ -53,7 +59,7 @@ AVFormatContext* get_input_format_context(
StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) {
C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader");
int ret = avformat_find_stream_info(format_ctx, nullptr);
int ret = libav().avformat_find_stream_info(format_ctx, nullptr);
TORCH_CHECK(
ret >= 0, "Failed to find stream information: ", av_err2string(ret));
......@@ -110,7 +116,7 @@ void validate_src_stream_type(
"Stream ",
i,
" is not ",
av_get_media_type_string(type),
libav().av_get_media_type_string(type),
" stream.");
}
......@@ -125,7 +131,8 @@ namespace {
OptionDict parse_metadata(const AVDictionary* metadata) {
AVDictionaryEntry* tag = nullptr;
OptionDict ret;
while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
while (
(tag = libav().av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
ret.emplace(std::string(tag->key), std::string(tag->value));
}
return ret;
......@@ -148,7 +155,8 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
ret.num_frames = stream->nb_frames;
ret.bits_per_sample = codecpar->bits_per_raw_sample;
ret.metadata = parse_metadata(stream->metadata);
const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
const AVCodecDescriptor* desc =
libav().avcodec_descriptor_get(codecpar->codec_id);
if (desc) {
ret.codec_name = desc->name;
ret.codec_long_name = desc->long_name;
......@@ -158,7 +166,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
case AVMEDIA_TYPE_AUDIO: {
AVSampleFormat smp_fmt = static_cast<AVSampleFormat>(codecpar->format);
if (smp_fmt != AV_SAMPLE_FMT_NONE) {
ret.fmt_name = av_get_sample_fmt_name(smp_fmt);
ret.fmt_name = libav().av_get_sample_fmt_name(smp_fmt);
}
ret.sample_rate = static_cast<double>(codecpar->sample_rate);
ret.num_channels = codecpar->channels;
......@@ -167,7 +175,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
case AVMEDIA_TYPE_VIDEO: {
AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(codecpar->format);
if (pix_fmt != AV_PIX_FMT_NONE) {
ret.fmt_name = av_get_pix_fmt_name(pix_fmt);
ret.fmt_name = libav().av_get_pix_fmt_name(pix_fmt);
}
ret.width = codecpar->width;
ret.height = codecpar->height;
......@@ -181,7 +189,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
namespace {
AVCodecParameters* get_codecpar() {
AVCodecParameters* ptr = avcodec_parameters_alloc();
AVCodecParameters* ptr = libav().avcodec_parameters_alloc();
TORCH_CHECK(ptr, "Failed to allocate resource.");
return ptr;
}
......@@ -192,7 +200,7 @@ StreamParams StreamReader::get_src_stream_params(int i) {
AVStream* stream = format_ctx->streams[i];
AVCodecParametersPtr codec_params(get_codecpar());
int ret = avcodec_parameters_copy(codec_params, stream->codecpar);
int ret = libav().avcodec_parameters_copy(codec_params, stream->codecpar);
TORCH_CHECK(
ret >= 0,
"Failed to copy the stream's codec parameters. (",
......@@ -234,12 +242,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const {
}
int64_t StreamReader::find_best_audio_stream() const {
return av_find_best_stream(
return libav().av_find_best_stream(
format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0);
}
int64_t StreamReader::find_best_video_stream() const {
return av_find_best_stream(
return libav().av_find_best_stream(
format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
}
......@@ -289,7 +297,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) {
TORCH_CHECK(false, "Invalid mode value: ", mode);
}
int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
int ret = libav().av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
if (ret < 0) {
seek_timestamp = 0;
......@@ -402,12 +410,12 @@ void StreamReader::add_stream(
case AVMEDIA_TYPE_AUDIO:
return AVRational{0, 1};
case AVMEDIA_TYPE_VIDEO:
return av_guess_frame_rate(format_ctx, stream, nullptr);
return libav().av_guess_frame_rate(format_ctx, stream, nullptr);
default:
TORCH_INTERNAL_ASSERT(
false,
"Unexpected media type is given: ",
av_get_media_type_string(media_type));
libav().av_get_media_type_string(media_type));
}
}();
int key = processors[i]->add_stream(
......@@ -446,7 +454,7 @@ void StreamReader::remove_stream(int64_t i) {
// 1: It's done, caller should stop calling
// <0: Some error happened
int StreamReader::process_packet() {
int ret = av_read_frame(format_ctx, packet);
int ret = libav().av_read_frame(format_ctx, packet);
if (ret == AVERROR_EOF) {
ret = drain();
return (ret < 0) ? ret : 1;
......@@ -577,12 +585,13 @@ AVIOContext* get_io_context(
int buffer_size,
int (*read_packet)(void* opaque, uint8_t* buf, int buf_size),
int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
unsigned char* buffer =
static_cast<unsigned char*>(libav().av_malloc(buffer_size));
TORCH_CHECK(buffer, "Failed to allocate buffer.");
AVIOContext* io_ctx = avio_alloc_context(
AVIOContext* io_ctx = libav().avio_alloc_context(
buffer, buffer_size, 0, opaque, read_packet, nullptr, seek);
if (!io_ctx) {
av_freep(&buffer);
libav().av_freep(&buffer);
TORCH_CHECK(false, "Failed to allocate AVIOContext.");
}
return io_ctx;
......
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/encode_process.h>
#include <cmath>
extern "C" {
#include <libavutil/rational.h>
}
namespace torchaudio::io {
using detail::libav;
////////////////////////////////////////////////////////////////////////////////
// EncodeProcess Logic Implementation
////////////////////////////////////////////////////////////////////////////////
......@@ -56,7 +63,7 @@ void EncodeProcess::process_frame(AVFrame* src) {
if (ret >= 0) {
encoder.encode(dst_frame);
}
av_frame_unref(dst_frame);
libav().av_frame_unref(dst_frame);
}
}
......@@ -71,8 +78,8 @@ void EncodeProcess::flush() {
namespace {
enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
auto fmt = av_get_sample_fmt(src.c_str());
if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) {
auto fmt = libav().av_get_sample_fmt(src.c_str());
if (fmt != AV_SAMPLE_FMT_NONE && !libav().av_sample_fmt_is_planar(fmt)) {
return fmt;
}
TORCH_CHECK(
......@@ -89,7 +96,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
AV_SAMPLE_FMT_S64,
AV_SAMPLE_FMT_FLT,
AV_SAMPLE_FMT_DBL}) {
ret.emplace_back(av_get_sample_fmt_name(fmt));
ret.emplace_back(libav().av_get_sample_fmt_name(fmt));
}
return c10::Join(", ", ret);
}(),
......@@ -97,7 +104,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
}
enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
AVPixelFormat fmt = av_get_pix_fmt(src.c_str());
AVPixelFormat fmt = libav().av_get_pix_fmt(src.c_str());
switch (fmt) {
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_RGB24:
......@@ -118,7 +125,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
AV_PIX_FMT_RGB24,
AV_PIX_FMT_BGR24,
AV_PIX_FMT_YUV444P}) {
ret.emplace_back(av_get_pix_fmt_name(fmt));
ret.emplace_back(libav().av_get_pix_fmt_name(fmt));
}
return c10::Join(", ", ret);
}(),
......@@ -132,18 +139,21 @@ const AVCodec* get_codec(
AVCodecID default_codec,
const c10::optional<std::string>& encoder) {
if (encoder) {
const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str());
const AVCodec* c =
libav().avcodec_find_encoder_by_name(encoder.value().c_str());
TORCH_CHECK(c, "Unexpected codec: ", encoder.value());
return c;
}
const AVCodec* c = avcodec_find_encoder(default_codec);
const AVCodec* c = libav().avcodec_find_encoder(default_codec);
TORCH_CHECK(
c, "Encoder not found for codec: ", avcodec_get_name(default_codec));
c,
"Encoder not found for codec: ",
libav().avcodec_get_name(default_codec));
return c;
}
AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) {
AVCodecContext* ctx = avcodec_alloc_context3(codec);
AVCodecContext* ctx = libav().avcodec_alloc_context3(codec);
TORCH_CHECK(ctx, "Failed to allocate CodecContext.");
if (flags & AVFMT_GLOBALHEADER) {
......@@ -169,25 +179,25 @@ void open_codec(
// while "libopus" refers to the one depends on libopusenc
// https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251
if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) {
if (!av_dict_get(opt, "strict", nullptr, 0)) {
if (!libav().av_dict_get(opt, "strict", nullptr, 0)) {
TORCH_WARN_ONCE(
"\"vorbis\" encoder is selected. Enabling '-strict experimental'. ",
"If this is not desired, please provide \"strict\" encoder option ",
"with desired value.");
av_dict_set(&opt, "strict", "experimental", 0);
libav().av_dict_set(&opt, "strict", "experimental", 0);
}
}
if (std::strcmp(codec_ctx->codec->name, "opus") == 0) {
if (!av_dict_get(opt, "strict", nullptr, 0)) {
if (!libav().av_dict_get(opt, "strict", nullptr, 0)) {
TORCH_WARN_ONCE(
"\"opus\" encoder is selected. Enabling '-strict experimental'. ",
"If this is not desired, please provide \"strict\" encoder option ",
"with desired value.");
av_dict_set(&opt, "strict", "experimental", 0);
libav().av_dict_set(&opt, "strict", "experimental", 0);
}
}
int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
int ret = libav().avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
clean_up_dict(opt);
TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")");
}
......@@ -214,7 +224,7 @@ bool supported_sample_fmt(
std::string get_supported_formats(const AVSampleFormat* sample_fmts) {
std::vector<std::string> ret;
while (*sample_fmts != AV_SAMPLE_FMT_NONE) {
ret.emplace_back(av_get_sample_fmt_name(*sample_fmts));
ret.emplace_back(libav().av_get_sample_fmt_name(*sample_fmts));
++sample_fmts;
}
return c10::Join(", ", ret);
......@@ -226,7 +236,7 @@ AVSampleFormat get_enc_fmt(
const AVCodec* codec) {
if (encoder_format) {
auto& enc_fmt_val = encoder_format.value();
auto fmt = av_get_sample_fmt(enc_fmt_val.c_str());
auto fmt = libav().av_get_sample_fmt(enc_fmt_val.c_str());
TORCH_CHECK(
fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val);
TORCH_CHECK(
......@@ -313,8 +323,8 @@ std::string get_supported_channels(const uint64_t* channel_layouts) {
std::vector<std::string> names;
while (*channel_layouts) {
std::stringstream ss;
ss << av_get_channel_layout_nb_channels(*channel_layouts);
ss << " (" << av_get_channel_name(*channel_layouts) << ")";
ss << libav().av_get_channel_layout_nb_channels(*channel_layouts);
ss << " (" << libav().av_get_channel_name(*channel_layouts) << ")";
names.emplace_back(ss.str());
++channel_layouts;
}
......@@ -331,10 +341,10 @@ uint64_t get_channel_layout(
TORCH_CHECK(
val > 0, "The number of channels must be greater than 0. Found: ", val);
if (!codec->channel_layouts) {
return static_cast<uint64_t>(av_get_default_channel_layout(val));
return static_cast<uint64_t>(libav().av_get_default_channel_layout(val));
}
for (const uint64_t* it = codec->channel_layouts; *it; ++it) {
if (av_get_channel_layout_nb_channels(*it) == val) {
if (libav().av_get_channel_layout_nb_channels(*it) == val) {
return *it;
}
}
......@@ -371,8 +381,9 @@ void configure_audio_codec_ctx(
const c10::optional<CodecConfig>& codec_config) {
codec_ctx->sample_fmt = format;
codec_ctx->sample_rate = sample_rate;
codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24));
codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout);
codec_ctx->time_base = av_inv_q(libav().av_d2q(sample_rate, 1 << 24));
codec_ctx->channels =
libav().av_get_channel_layout_nb_channels(channel_layout);
codec_ctx->channel_layout = channel_layout;
// Set optional stuff
......@@ -411,7 +422,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) {
std::string get_supported_formats(const AVPixelFormat* pix_fmts) {
std::vector<std::string> ret;
while (*pix_fmts != AV_PIX_FMT_NONE) {
ret.emplace_back(av_get_pix_fmt_name(*pix_fmts));
ret.emplace_back(libav().av_get_pix_fmt_name(*pix_fmts));
++pix_fmts;
}
return c10::Join(", ", ret);
......@@ -423,7 +434,7 @@ AVPixelFormat get_enc_fmt(
const AVCodec* codec) {
if (encoder_format) {
const auto& val = encoder_format.value();
auto fmt = av_get_pix_fmt(val.c_str());
auto fmt = libav().av_get_pix_fmt(val.c_str());
TORCH_CHECK(
supported_pix_fmt(fmt, codec->pix_fmts),
codec->name,
......@@ -461,7 +472,7 @@ AVRational get_enc_rate(
std::isfinite(enc_rate) && enc_rate > 0,
"Encoder sample rate must be positive and fininte. Found: ",
enc_rate);
AVRational rate = av_d2q(enc_rate, 1 << 24);
AVRational rate = libav().av_d2q(enc_rate, 1 << 24);
TORCH_CHECK(
supported_frame_rate(rate, codec->supported_framerates),
codec->name,
......@@ -545,14 +556,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
// context to AVCodecContext. But this way, it will be deallocated
// automatically at the time AVCodecContext is freed, so we do that.
ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index()));
ctx->hw_device_ctx = libav().av_buffer_ref(get_cuda_context(device.index()));
TORCH_INTERNAL_ASSERT(
ctx->hw_device_ctx, "Failed to reference HW device context.");
ctx->sw_pix_fmt = ctx->pix_fmt;
ctx->pix_fmt = AV_PIX_FMT_CUDA;
ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx);
ctx->hw_frames_ctx = libav().av_hwframe_ctx_alloc(ctx->hw_device_ctx);
TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context.");
auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data);
......@@ -562,7 +573,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
frames_ctx->height = ctx->height;
frames_ctx->initial_pool_size = 5;
int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx);
int ret = libav().av_hwframe_ctx_init(ctx->hw_frames_ctx);
TORCH_CHECK(
ret >= 0,
"Failed to initialize CUDA frame context: ",
......@@ -574,11 +585,12 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
////////////////////////////////////////////////////////////////////////////////
AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) {
AVStream* stream = avformat_new_stream(format_ctx, nullptr);
AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr);
TORCH_CHECK(stream, "Failed to allocate stream.");
stream->time_base = codec_ctx->time_base;
int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx);
int ret =
libav().avcodec_parameters_from_context(stream->codecpar, codec_ctx);
TORCH_CHECK(
ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret));
return stream;
......@@ -605,7 +617,7 @@ FilterGraph get_audio_filter_graph(
if (filter_desc || src_fmt != enc_fmt ||
src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) {
std::stringstream ss;
ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt)
ss << "aformat=sample_fmts=" << libav().av_get_sample_fmt_name(enc_fmt)
<< ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x"
<< std::hex << enc_ch_layout;
parts.push_back(ss.str());
......@@ -656,7 +668,7 @@ FilterGraph get_video_filter_graph(
}
if (filter_desc || src_fmt != enc_fmt) {
std::stringstream ss;
ss << "format=" << av_get_pix_fmt_name(enc_fmt);
ss << "format=" << libav().av_get_pix_fmt_name(enc_fmt);
parts.emplace_back(ss.str());
}
if (filter_desc ||
......@@ -695,7 +707,7 @@ AVFramePtr get_audio_frame(
frame->channel_layout = channel_layout;
frame->sample_rate = sample_rate;
frame->nb_samples = nb_samples;
int ret = av_frame_get_buffer(frame, 0);
int ret = libav().av_frame_get_buffer(frame, 0);
TORCH_CHECK(
ret >= 0, "Error allocating the source audio frame:", av_err2string(ret));
......@@ -711,7 +723,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) {
frame->format = src_fmt;
frame->width = width;
frame->height = height;
int ret = av_frame_get_buffer(frame, 0);
int ret = libav().av_frame_get_buffer(frame, 0);
TORCH_CHECK(
ret >= 0, "Error allocating a video buffer :", av_err2string(ret));
......@@ -756,10 +768,10 @@ EncodeProcess get_audio_encode_process(
// case, restrictions on the format to support tensor inputs do not apply, and
// so we directly get the format via FFmpeg.
const AVSampleFormat src_fmt = (disable_converter)
? av_get_sample_fmt(format.c_str())
? libav().av_get_sample_fmt(format.c_str())
: get_src_sample_fmt(format);
const auto src_ch_layout =
static_cast<uint64_t>(av_get_default_channel_layout(src_num_channels));
const auto src_ch_layout = static_cast<uint64_t>(
libav().av_get_default_channel_layout(src_num_channels));
// 2. Fetch codec from default or override
TORCH_CHECK(
......@@ -779,7 +791,7 @@ EncodeProcess get_audio_encode_process(
// https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277
// This is the case for at least until FFmpeg 6.0, so it will be
// like this for a while.
return static_cast<uint64_t>(av_get_default_channel_layout(2));
return static_cast<uint64_t>(libav().av_get_default_channel_layout(2));
}
return get_channel_layout(src_ch_layout, encoder_num_channels, codec);
}();
......@@ -867,9 +879,9 @@ EncodeProcess get_video_encode_process(
// case, restrictions on the format to support tensor inputs do not apply, and
// so we directly get the format via FFmpeg.
const AVPixelFormat src_fmt = (disable_converter)
? av_get_pix_fmt(format.c_str())
? libav().av_get_pix_fmt(format.c_str())
: get_src_pix_fmt(format);
const AVRational src_rate = av_d2q(frame_rate, 1 << 24);
const AVRational src_rate = libav().av_d2q(frame_rate, 1 << 24);
// 2. Fetch codec from default or override
TORCH_CHECK(
......@@ -936,7 +948,8 @@ EncodeProcess get_video_encode_process(
AVFramePtr src_frame = [&]() {
if (codec_ctx->hw_frames_ctx) {
AVFramePtr frame{alloc_avframe()};
int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
int ret =
libav().av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
frame->nb_samples = 1;
frame->pts = 0;
......
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/encoder.h>
namespace torchaudio::io {
using detail::libav;
Encoder::Encoder(
AVFormatContext* format_ctx,
AVCodecContext* codec_ctx,
......@@ -13,10 +16,10 @@ Encoder::Encoder(
///
/// @param frame Frame data to encode
void Encoder::encode(AVFrame* frame) {
int ret = avcodec_send_frame(codec_ctx, frame);
int ret = libav().avcodec_send_frame(codec_ctx, frame);
TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ").");
while (ret >= 0) {
ret = avcodec_receive_packet(codec_ctx, packet);
ret = libav().avcodec_receive_packet(codec_ctx, packet);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
if (ret == AVERROR_EOF) {
// Note:
......@@ -31,7 +34,7 @@ void Encoder::encode(AVFrame* frame) {
// An alternative is to use `av_write_frame` functoin, but in that case
// client code is responsible for ordering packets, which makes it
// complicated to use StreamWriter
ret = av_interleaved_write_frame(format_ctx, nullptr);
ret = libav().av_interleaved_write_frame(format_ctx, nullptr);
TORCH_CHECK(
ret >= 0, "Failed to flush packet (", av_err2string(ret), ").");
}
......@@ -51,10 +54,11 @@ void Encoder::encode(AVFrame* frame) {
// This has to be set before av_packet_rescale_ts bellow.
packet->duration = 1;
}
av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base);
libav().av_packet_rescale_ts(
packet, codec_ctx->time_base, stream->time_base);
packet->stream_index = stream->index;
ret = av_interleaved_write_frame(format_ctx, packet);
ret = libav().av_interleaved_write_frame(format_ctx, packet);
TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ").");
}
}
......
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/packet_writer.h>
namespace torchaudio::io {
using detail::libav;
namespace {
AVStream* add_stream(
AVFormatContext* format_ctx,
const StreamParams& stream_params) {
AVStream* stream = avformat_new_stream(format_ctx, nullptr);
int ret =
avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
AVStream* stream = libav().avformat_new_stream(format_ctx, nullptr);
int ret = libav().avcodec_parameters_copy(
stream->codecpar, stream_params.codec_params);
TORCH_CHECK(
ret >= 0,
"Failed to copy the stream's codec parameters. (",
......@@ -26,11 +30,12 @@ PacketWriter::PacketWriter(
void PacketWriter::write_packet(const AVPacketPtr& packet) {
AVPacket dst_packet;
int ret = av_packet_ref(&dst_packet, packet);
int ret = libav().av_packet_ref(&dst_packet, packet);
TORCH_CHECK(ret >= 0, "Failed to copy packet.");
av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base);
libav().av_packet_rescale_ts(
&dst_packet, original_time_base, stream->time_base);
dst_packet.stream_index = stream->index;
ret = av_interleaved_write_frame(format_ctx, &dst_packet);
ret = libav().av_interleaved_write_frame(format_ctx, &dst_packet);
TORCH_CHECK(ret >= 0, "Failed to write packet to destination.");
}
} // namespace torchaudio::io
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
#ifdef USE_CUDA
#include <c10/cuda/CUDAStream.h>
#endif
namespace torchaudio {
namespace io {
namespace torchaudio::io {
using detail::libav;
namespace {
AVFormatContext* get_output_format_context(
......@@ -19,7 +22,7 @@ AVFormatContext* get_output_format_context(
}
AVFormatContext* p = nullptr;
int ret = avformat_alloc_output_context2(
int ret = libav().avformat_alloc_output_context2(
&p, nullptr, format ? format.value().c_str() : nullptr, dst.c_str());
TORCH_CHECK(
ret >= 0,
......@@ -208,14 +211,14 @@ void StreamWriter::add_video_frame_stream(
}
// Replaces the container-level metadata (title, artist, etc.) with the given
// key/value pairs. Any previously set metadata is discarded first.
//
// NOTE: diff residue removed — the pre-refactor bare `av_dict_free`/
// `av_dict_set` calls were duplicated alongside the libav()-dispatched ones,
// which would have freed and populated the dictionary twice. Only the
// libav()-dispatched calls remain.
void StreamWriter::set_metadata(const OptionDict& metadata) {
  libav().av_dict_free(&format_ctx->metadata);
  for (auto const& [key, value] : metadata) {
    libav().av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0);
  }
}
// Prints human-readable information about output stream `i` (codec, format,
// bitrate, ...) to stderr, mirroring what the ffmpeg CLI shows. The last
// argument (1) tells FFmpeg this is an output context.
//
// NOTE: diff residue removed — both the bare `av_dump_format` call and the
// libav()-dispatched one were present, dumping the format twice. Only the
// libav()-dispatched call remains.
void StreamWriter::dump_format(int64_t i) {
  libav().av_dump_format(format_ctx, (int)i, format_ctx->url, 1);
}
void StreamWriter::open(const c10::optional<OptionDict>& option) {
......@@ -231,10 +234,10 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
AVDictionary* opt = get_option_dict(option);
if (!(fmt->flags & AVFMT_NOFILE) &&
!(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
ret = avio_open2(
ret = libav().avio_open2(
&format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt);
if (ret < 0) {
av_dict_free(&opt);
libav().av_dict_free(&opt);
TORCH_CHECK(
false,
"Failed to open dst: ",
......@@ -245,7 +248,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
}
}
ret = avformat_write_header(format_ctx, &opt);
ret = libav().avformat_write_header(format_ctx, &opt);
clean_up_dict(opt);
TORCH_CHECK(
ret >= 0,
......@@ -258,7 +261,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
}
void StreamWriter::close() {
int ret = av_write_trailer(format_ctx);
int ret = libav().av_write_trailer(format_ctx);
if (ret < 0) {
LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ").";
}
......@@ -269,7 +272,7 @@ void StreamWriter::close() {
if (!(fmt->flags & AVFMT_NOFILE) &&
!(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
// avio_closep can be only applied to AVIOContext opened by avio_open
avio_closep(&(format_ctx->pb));
libav().avio_closep(&(format_ctx->pb));
}
is_open = false;
}
......@@ -355,12 +358,13 @@ AVIOContext* get_io_context(
int buffer_size,
int (*write_packet)(void* opaque, uint8_t* buf, int buf_size),
int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
unsigned char* buffer =
static_cast<unsigned char*>(libav().av_malloc(buffer_size));
TORCH_CHECK(buffer, "Failed to allocate buffer.");
AVIOContext* io_ctx = avio_alloc_context(
AVIOContext* io_ctx = libav().avio_alloc_context(
buffer, buffer_size, 1, opaque, nullptr, write_packet, seek);
if (!io_ctx) {
av_freep(&buffer);
libav().av_freep(&buffer);
TORCH_CHECK(false, "Failed to allocate AVIOContext.");
}
return io_ctx;
......@@ -384,5 +388,4 @@ StreamWriterCustomIO::StreamWriterCustomIO(
: CustomOutput(opaque, buffer_size, write_packet, seek),
StreamWriter(io_ctx, format) {}
} // namespace io
} // namespace torchaudio
} // namespace torchaudio::io
#include <torchaudio/csrc/ffmpeg/libav.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.h>
#ifdef USE_CUDA
......@@ -6,6 +7,8 @@
namespace torchaudio::io {
using detail::libav;
namespace {
using InitFunc = TensorConverter::InitFunc;
......@@ -41,8 +44,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels);
// https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334
if (!av_frame_is_writable(buffer)) {
int ret = av_frame_make_writable(buffer);
if (!libav().av_frame_is_writable(buffer)) {
int ret = libav().av_frame_make_writable(buffer);
TORCH_INTERNAL_ASSERT(
ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
}
......@@ -145,8 +148,8 @@ void write_interlaced_video(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels);
// https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
if (!av_frame_is_writable(buffer)) {
int ret = av_frame_make_writable(buffer);
if (!libav().av_frame_is_writable(buffer)) {
int ret = libav().av_frame_make_writable(buffer);
TORCH_INTERNAL_ASSERT(
ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
}
......@@ -187,7 +190,7 @@ void write_planar_video(
AVFrame* buffer,
int num_planes) {
const auto num_colors =
av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
libav().av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors);
......@@ -195,8 +198,8 @@ void write_planar_video(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width);
// https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
if (!av_frame_is_writable(buffer)) {
int ret = av_frame_make_writable(buffer);
if (!libav().av_frame_is_writable(buffer)) {
int ret = libav().av_frame_make_writable(buffer);
TORCH_INTERNAL_ASSERT(
ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
}
......@@ -308,7 +311,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
TORCH_CHECK(
false,
"Unexpected pixel format for CUDA: ",
av_get_pix_fmt_name(sw_pix_fmt));
libav().av_get_pix_fmt_name(sw_pix_fmt));
}
}
......@@ -317,7 +320,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_RGB24:
case AV_PIX_FMT_BGR24: {
int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components;
int channels = libav().av_pix_fmt_desc_get(pix_fmt)->nb_components;
InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) {
validate_video_input(t, f, channels);
return init_interlaced(t);
......@@ -339,7 +342,9 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
}
default:
TORCH_CHECK(
false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt));
false,
"Unexpected pixel format: ",
libav().av_get_pix_fmt_name(pix_fmt));
}
}
......@@ -383,7 +388,9 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size)
break;
default:
TORCH_INTERNAL_ASSERT(
false, "Unsupported media type: ", av_get_media_type_string(type));
false,
"Unsupported media type: ",
libav().av_get_media_type_string(type));
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment