Migrate the binding of FFmpeg utils to PyBind11 (#3228)

Summary: Utility functions are only available to Python, so there is no need to use TorchBind for them. This should allow us to remove the link-whole flag when linking the `libtorchaudio_ffmpeg` part. Pull Request resolved: https://github.com/pytorch/audio/pull/3228 Reviewed By: nateanl Differential Revision: D44639560 Pulled By: mthrok fbshipit-source-id: 5116073ee8c5ab572c63ad123942c4826bfe1100

Migrate the binding of FFmpeg utils to PyBind11 (#3228)
Summary: Utility functions are only available to Python, so there is no need to use TorchBind for them. This should allow us to remove the link-whole flag when linking the `libtorchaudio_ffmpeg` part. Pull Request resolved: https://github.com/pytorch/audio/pull/3228 Reviewed By: nateanl Differential Revision: D44639560 Pulled By: mthrok fbshipit-source-id: 5116073ee8c5ab572c63ad123942c4826bfe1100
61c31bc0 · moto · Facebook GitHub Bot · c22cd167 · 61c31bc0 · 61c31bc0
Commit 61c31bc0 authored Apr 03, 2023 by moto Committed by Facebook GitHub Bot Apr 03, 2023
5 changed files
--- a/torchaudio/_extension/utils.py
+++ b/torchaudio/_extension/utils.py
@@ -88,9 +88,9 @@ def _init_ffmpeg():
    import torchaudio.lib._torchaudio_ffmpeg  # noqa
-    torch.ops.torchaudio.ffmpeg_init()
+    torchaudio.lib._torchaudio_ffmpeg.init()
-    if torch.ops.torchaudio.ffmpeg_get_log_level() > 8:
+    if torchaudio.lib._torchaudio_ffmpeg.get_log_level() > 8:
-        torch.ops.torchaudio.ffmpeg_set_log_level(8)
+        torchaudio.lib._torchaudio_ffmpeg.set_log_level(8)
 def _init_dll_path():

--- a/torchaudio/csrc/ffmpeg/CMakeLists.txt
+++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt
@@ -21,7 +21,6 @@ set(
  stream_writer/stream_writer.cpp
  stream_writer/tensor_converter.cpp
  compat.cpp
-  utils.cpp
  )
 if (USE_CUDA)

--- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
+++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -8,6 +8,97 @@ namespace torchaudio {
 namespace io {
 namespace {
+// Collects the version reported by each FFmpeg library's `*_version()`
+// function, keyed by library name ("libavutil", "libavcodec", ...).
+// Each value is the (major, minor, micro) triple extracted with the
+// AV_VERSION_* macros.
+std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
+  std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> versions;
+  // Splits one version integer into its three components and stores it.
+  const auto record = [&versions](const char* lib_name, int packed) {
+    versions.emplace(
+        lib_name,
+        std::make_tuple(
+            AV_VERSION_MAJOR(packed),
+            AV_VERSION_MINOR(packed),
+            AV_VERSION_MICRO(packed)));
+  };
+  record("libavutil", avutil_version());
+  record("libavcodec", avcodec_version());
+  record("libavformat", avformat_version());
+  record("libavfilter", avfilter_version());
+  record("libavdevice", avdevice_version());
+  return versions;
+}
+// Enumerates the input formats yielded by av_demuxer_iterate().
+//
+// req_device selects which entries are returned: true keeps only formats
+// whose private class category satisfies AV_IS_INPUT_DEVICE, false keeps
+// only the remaining (non-device) formats.
+// Returns a map from each format's `name` to its `long_name` (empty string
+// when no long name is available).
+std::map<std::string, std::string> get_demuxers(bool req_device) {
+  std::map<std::string, std::string> ret;
+  const AVInputFormat* fmt = nullptr;
+  void* i = nullptr;
+  while ((fmt = av_demuxer_iterate(&i))) {
+    const AVClass* avclass = fmt->priv_class;
+    const bool is_device = avclass && AV_IS_INPUT_DEVICE(avclass->category);
+    if (req_device == is_device) {
+      // long_name can be null (FFmpeg defines it via NULL_IF_CONFIG_SMALL).
+      // Constructing std::string from a null pointer is undefined behavior,
+      // so fall back to an empty description, mirroring get_codecs.
+      ret.emplace(fmt->name, fmt->long_name ? fmt->long_name : "");
+    }
+  }
+  return ret;
+}
+// Enumerates the output formats yielded by av_muxer_iterate().
+//
+// req_device selects which entries are returned: true keeps only formats
+// whose private class category satisfies AV_IS_OUTPUT_DEVICE, false keeps
+// only the remaining (non-device) formats.
+// Returns a map from each format's `name` to its `long_name` (empty string
+// when no long name is available).
+std::map<std::string, std::string> get_muxers(bool req_device) {
+  std::map<std::string, std::string> ret;
+  const AVOutputFormat* fmt = nullptr;
+  void* i = nullptr;
+  while ((fmt = av_muxer_iterate(&i))) {
+    const AVClass* avclass = fmt->priv_class;
+    const bool is_device = avclass && AV_IS_OUTPUT_DEVICE(avclass->category);
+    if (req_device == is_device) {
+      // long_name can be null (FFmpeg defines it via NULL_IF_CONFIG_SMALL).
+      // Constructing std::string from a null pointer is undefined behavior,
+      // so fall back to an empty description, mirroring get_codecs.
+      ret.emplace(fmt->name, fmt->long_name ? fmt->long_name : "");
+    }
+  }
+  return ret;
+}
+// Lists the codecs yielded by av_codec_iterate() that match `type`.
+// req_encoder == true keeps encoders; false keeps decoders.
+// Returns a map from codec name to long name; codecs without a name are
+// skipped and a null long name is stored as an empty string.
+std::map<std::string, std::string> get_codecs(
+    AVMediaType type,
+    bool req_encoder) {
+  std::map<std::string, std::string> codecs;
+  void* iter_state = nullptr;
+  for (const AVCodec* codec = av_codec_iterate(&iter_state); codec != nullptr;
+       codec = av_codec_iterate(&iter_state)) {
+    // Only query the predicate for the role the caller asked for.
+    const bool role_matches = req_encoder ? av_codec_is_encoder(codec)
+                                          : av_codec_is_decoder(codec);
+    if (!role_matches) {
+      continue;
+    }
+    if (codec->type != type || !codec->name) {
+      continue;
+    }
+    const char* description = codec->long_name ? codec->long_name : "";
+    codecs.emplace(codec->name, description);
+  }
+  return codecs;
+}
+// Gathers the protocol names yielded by avio_enum_protocols(), in the order
+// they are returned. `output` is forwarded unchanged as that function's
+// second argument.
+std::vector<std::string> get_protocols(bool output) {
+  std::vector<std::string> protocols;
+  void* iter_state = nullptr;
+  for (const char* proto = avio_enum_protocols(&iter_state, output);
+       proto != nullptr;
+       proto = avio_enum_protocols(&iter_state, output)) {
+    protocols.emplace_back(proto);
+  }
+  return protocols;
+}
+// Returns a copy of the string reported by avcodec_configuration().
+std::string get_build_config() {
+  const char* config = avcodec_configuration();
+  return std::string(config);
+}
 // The reason we inherit FileObj instead of making it an attribute
 // is so that FileObj is instantiated first.
 // AVIOContext must be initialized before AVFormat, and outlive AVFormat.
@@ -31,7 +122,31 @@ struct StreamWriterFileObj : private FileObj, public StreamWriter {
 };
 PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
+  m.def("init", []() { avdevice_register_all(); });
+  m.def("get_log_level", []() { return av_log_get_level(); });
+  m.def("set_log_level", [](int level) { av_log_set_level(level); });
+  m.def("get_versions", &get_versions);
+  m.def("get_muxers", []() { return get_muxers(false); });
+  m.def("get_demuxers", []() { return get_demuxers(false); });
+  m.def("get_input_devices", []() { return get_demuxers(true); });
+  m.def("get_build_config", &get_build_config);
+  m.def("get_output_devices", []() { return get_muxers(true); });
+  m.def("get_audio_decoders", []() {
+    return get_codecs(AVMEDIA_TYPE_AUDIO, false);
+  });
+  m.def("get_audio_encoders", []() {
+    return get_codecs(AVMEDIA_TYPE_AUDIO, true);
+  });
+  m.def("get_video_decoders", []() {
+    return get_codecs(AVMEDIA_TYPE_VIDEO, false);
+  });
+  m.def("get_video_encoders", []() {
+    return get_codecs(AVMEDIA_TYPE_VIDEO, true);
+  });
+  m.def("get_input_protocols", []() { return get_protocols(false); });
+  m.def("get_output_protocols", []() { return get_protocols(true); });
  m.def("clear_cuda_context_cache", &clear_cuda_context_cache);
  py::class_<Chunk>(m, "Chunk", py::module_local())
      .def_readwrite("frames", &Chunk::frames)
      .def_readwrite("pts", &Chunk::pts);

--- a/torchaudio/csrc/ffmpeg/utils.cpp
+++ b/torchaudio/csrc/ffmpeg/utils.cpp
-#include <torch/script.h>
-#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
-namespace torchaudio {
-namespace io {
-namespace {
-c10::Dict<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
-  c10::Dict<std::string, std::tuple<int64_t, int64_t, int64_t>> ret;
-#define add_version(NAME)            \
-  {                                  \
-    int ver = NAME##_version();      \
-    ret.insert(                      \
-        "lib" #NAME,                 \
-        std::make_tuple<>(           \
-            AV_VERSION_MAJOR(ver),   \
-            AV_VERSION_MINOR(ver),   \
-            AV_VERSION_MICRO(ver))); \
-  }
-  add_version(avutil);
-  add_version(avcodec);
-  add_version(avformat);
-  add_version(avfilter);
-  add_version(avdevice);
-  return ret;
-#undef add_version
-}
-c10::Dict<std::string, std::string> get_demuxers(bool req_device) {
-  c10::Dict<std::string, std::string> ret;
-  const AVInputFormat* fmt = nullptr;
-  void* i = nullptr;
-  while ((fmt = av_demuxer_iterate(&i))) {
-    assert(fmt);
-    bool is_device = [&]() {
-      const AVClass* avclass = fmt->priv_class;
-      return avclass && AV_IS_INPUT_DEVICE(avclass->category);
-    }();
-    if (req_device == is_device) {
-      ret.insert(fmt->name, fmt->long_name);
-    }
-  }
-  return ret;
-}
-c10::Dict<std::string, std::string> get_muxers(bool req_device) {
-  c10::Dict<std::string, std::string> ret;
-  const AVOutputFormat* fmt = nullptr;
-  void* i = nullptr;
-  while ((fmt = av_muxer_iterate(&i))) {
-    assert(fmt);
-    bool is_device = [&]() {
-      const AVClass* avclass = fmt->priv_class;
-      return avclass && AV_IS_OUTPUT_DEVICE(avclass->category);
-    }();
-    if (req_device == is_device) {
-      ret.insert(fmt->name, fmt->long_name);
-    }
-  }
-  return ret;
-}
-c10::Dict<std::string, std::string> get_codecs(
-    AVMediaType type,
-    bool req_encoder) {
-  const AVCodec* c = nullptr;
-  void* i = nullptr;
-  c10::Dict<std::string, std::string> ret;
-  while ((c = av_codec_iterate(&i))) {
-    assert(c);
-    if ((req_encoder && av_codec_is_encoder(c)) ||
-        (!req_encoder && av_codec_is_decoder(c))) {
-      if (c->type == type && c->name) {
-        ret.insert(c->name, c->long_name ? c->long_name : "");
-      }
-    }
-  }
-  return ret;
-}
-std::vector<std::string> get_protocols(bool output) {
-  void* opaque = nullptr;
-  const char* name = nullptr;
-  std::vector<std::string> ret;
-  while ((name = avio_enum_protocols(&opaque, output))) {
-    assert(name);
-    ret.emplace_back(name);
-  }
-  return ret;
-}
-std::string get_build_config() {
-  return avcodec_configuration();
-}
-TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
-  m.def("torchaudio::ffmpeg_init", []() { avdevice_register_all(); });
-  m.def("torchaudio::ffmpeg_get_log_level", []() -> int64_t {
-    return static_cast<int64_t>(av_log_get_level());
-  });
-  m.def("torchaudio::ffmpeg_set_log_level", [](int64_t level) {
-    av_log_set_level(static_cast<int>(level));
-  });
-  m.def("torchaudio::ffmpeg_get_versions", &get_versions);
-  m.def("torchaudio::ffmpeg_get_muxers", []() { return get_muxers(false); });
-  m.def(
-      "torchaudio::ffmpeg_get_demuxers", []() { return get_demuxers(false); });
-  m.def("torchaudio::ffmpeg_get_input_devices", []() {
-    return get_demuxers(true);
-  });
-  m.def("torchaudio::ffmpeg_get_build_config", []() {
-    return get_build_config();
-  });
-  m.def("torchaudio::ffmpeg_get_output_devices", []() {
-    return get_muxers(true);
-  });
-  m.def("torchaudio::ffmpeg_get_audio_decoders", []() {
-    return get_codecs(AVMEDIA_TYPE_AUDIO, false);
-  });
-  m.def("torchaudio::ffmpeg_get_audio_encoders", []() {
-    return get_codecs(AVMEDIA_TYPE_AUDIO, true);
-  });
-  m.def("torchaudio::ffmpeg_get_video_decoders", []() {
-    return get_codecs(AVMEDIA_TYPE_VIDEO, false);
-  });
-  m.def("torchaudio::ffmpeg_get_video_encoders", []() {
-    return get_codecs(AVMEDIA_TYPE_VIDEO, true);
-  });
-  m.def("torchaudio::ffmpeg_get_input_protocols", []() {
-    return get_protocols(false);
-  });
-  m.def("torchaudio::ffmpeg_get_output_protocols", []() {
-    return get_protocols(true);
-  });
-}
-} // namespace
-} // namespace io
-} // namespace torchaudio
--- a/torchaudio/utils/ffmpeg_utils.py
+++ b/torchaudio/utils/ffmpeg_utils.py
@@ -16,7 +16,7 @@ def get_versions() -> Dict[str, Tuple[int]]:
        dict: mapping from library names to version tuples,
            e.g. `"libavutil": (56, 22, 100)`.
    """
-    return torch.ops.torchaudio.ffmpeg_get_versions()
+    return torchaudio.lib._torchaudio_ffmpeg.get_versions()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -25,7 +25,7 @@ def get_log_level() -> int:
    See :py:func:`set_log_level` for the details.
    """
-    return torch.ops.torchaudio.ffmpeg_get_log_level()
+    return torchaudio.lib._torchaudio_ffmpeg.get_log_level()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -62,7 +62,7 @@ def set_log_level(level: int):
                  Extremely verbose debugging, useful for libav* development.
    """
-    torch.ops.torchaudio.ffmpeg_set_log_level(level)
+    torchaudio.lib._torchaudio_ffmpeg.set_log_level(level)
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -80,7 +80,7 @@ def get_demuxers() -> Dict[str, str]:
        ... aax: CRI AAX
        ... ac3: raw AC-3
    """
-    return torch.ops.torchaudio.ffmpeg_get_demuxers()
+    return torchaudio.lib._torchaudio_ffmpeg.get_demuxers()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -99,7 +99,7 @@ def get_muxers() -> Dict[str, str]:
        ... adx: CRI ADX
        ... aiff: Audio IFF
    """
-    return torch.ops.torchaudio.ffmpeg_get_muxers()
+    return torchaudio.lib._torchaudio_ffmpeg.get_muxers()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -118,7 +118,7 @@ def get_audio_decoders() -> Dict[str, str]:
        ... adx: CRI ADX
        ... aiff: Audio IFF
    """
-    return torch.ops.torchaudio.ffmpeg_get_audio_decoders()
+    return torchaudio.lib._torchaudio_ffmpeg.get_audio_decoders()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -138,7 +138,7 @@ def get_audio_encoders() -> Dict[str, str]:
        ... ac3_fixed: ATSC A/52A (AC-3)
        ... alac: ALAC (Apple Lossless Audio Codec)
    """
-    return torch.ops.torchaudio.ffmpeg_get_audio_encoders()
+    return torchaudio.lib._torchaudio_ffmpeg.get_audio_encoders()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -158,7 +158,7 @@ def get_video_decoders() -> Dict[str, str]:
        ... amv: AMV Video
        ... anm: Deluxe Paint Animation
    """
-    return torch.ops.torchaudio.ffmpeg_get_video_decoders()
+    return torchaudio.lib._torchaudio_ffmpeg.get_video_decoders()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -179,7 +179,7 @@ def get_video_encoders() -> Dict[str, str]:
        ... asv1: ASUS V1
        ... asv2: ASUS V2
    """
-    return torch.ops.torchaudio.ffmpeg_get_video_encoders()
+    return torchaudio.lib._torchaudio_ffmpeg.get_video_encoders()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -195,7 +195,7 @@ def get_input_devices() -> Dict[str, str]:
        ... avfoundation: AVFoundation input device
        ... lavfi: Libavfilter virtual input device
    """
-    return torch.ops.torchaudio.ffmpeg_get_input_devices()
+    return torchaudio.lib._torchaudio_ffmpeg.get_input_devices()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -210,7 +210,7 @@ def get_output_devices() -> Dict[str, str]:
        >>>     print(f"{k}: {v}")
        ... audiotoolbox: AudioToolbox output device
    """
-    return torch.ops.torchaudio.ffmpeg_get_output_devices()
+    return torchaudio.lib._torchaudio_ffmpeg.get_output_devices()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -224,7 +224,7 @@ def get_input_protocols() -> List[str]:
        >>> print(get_input_protocols())
        ... ['file', 'ftp', 'hls', 'http','https', 'pipe', 'rtmp', 'tcp', 'tls', 'udp', 'unix']
    """
-    return torch.ops.torchaudio.ffmpeg_get_input_protocols()
+    return torchaudio.lib._torchaudio_ffmpeg.get_input_protocols()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -238,7 +238,7 @@ def get_output_protocols() -> List[str]:
        >>> print(get_output_protocols())
        ... ['file', 'ftp', 'http', 'https', 'md5', 'pipe', 'prompeg', 'rtmp', 'tee', 'tcp', 'tls', 'udp', 'unix']
    """
-    return torch.ops.torchaudio.ffmpeg_get_output_protocols()
+    return torchaudio.lib._torchaudio_ffmpeg.get_output_protocols()
 @torchaudio._extension.fail_if_no_ffmpeg
@@ -252,7 +252,7 @@ def get_build_config() -> str:
        >>> print(get_build_config())
        --prefix=/Users/runner/miniforge3 --cc=arm64-apple-darwin20.0.0-clang --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-neon --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/pkg-config --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/x86_64-apple-darwin13.4.0-clang  # noqa
    """
-    return torch.ops.torchaudio.ffmpeg_get_build_config()
+    return torchaudio.lib._torchaudio_ffmpeg.get_build_config()
 @torchaudio._extension.fail_if_no_ffmpeg