Simplify HW encoder object handling (#3138)

Summary: hw_device_ctx and hw_frame_ctx assigned to an AVCodecContext object are owned by libavcodec, and get freed in [avcodec_free_context](https://ffmpeg.org/doxygen/4.1/group__lavc__core.html#gaf869d0829ed607cec3a4a02a1c7026b3) (actually in [avcodec_close](https://ffmpeg.org/doxygen/4.1/libavcodec_2utils_8c_source.html#l01069)), so we do not need to keep the reference around. Pull Request resolved: https://github.com/pytorch/audio/pull/3138 Reviewed By: nateanl Differential Revision: D43738009 Pulled By: mthrok fbshipit-source-id: 8c1f4217fa7b21dce872d12be9245056f3fc7537

Simplify HW encoder object handling (#3138)
Summary: hw_device_ctx and hw_frame_ctx assigned to an AVCodecContext object are owned by libavcodec, and get freed in [avcodec_free_context](https://ffmpeg.org/doxygen/4.1/group__lavc__core.html#gaf869d0829ed607cec3a4a02a1c7026b3) (actually in [avcodec_close](https://ffmpeg.org/doxygen/4.1/libavcodec_2utils_8c_source.html#l01069)), so we do not need to keep the reference around. Pull Request resolved: https://github.com/pytorch/audio/pull/3138 Reviewed By: nateanl Differential Revision: D43738009 Pulled By: mthrok fbshipit-source-id: 8c1f4217fa7b21dce872d12be9245056f3fc7537
26acdbff · moto · Facebook GitHub Bot · 41e3b93d · 26acdbff · 26acdbff
Commit 26acdbff authored Mar 03, 2023 by moto Committed by Facebook GitHub Bot Mar 03, 2023
3 changed files
--- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -342,6 +342,56 @@ AVCodecContextPtr get_audio_codec(
  return ctx;
 }
+void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
+  torch::Device device{hw_accel};
+  TORCH_CHECK(
+      device.type() == c10::DeviceType::CUDA,
+      "Only CUDA is supported for hardware acceleration. Found: ",
+      device.str());
+  // NOTES:
+  // 1. Examples like
+  // https://ffmpeg.org/doxygen/4.1/hw_decode_8c-example.html#a9 wrap the HW
+  // device context and the HW frames context with av_buffer_ref. This
+  // increments the reference count, so the resource won't be automatically
+  // deallocated at the time AVCodecContext is destructed. (We would need to
+  // decrement it once ourselves), so we do not do it. When adding support to share
+  // context objects, this needs to be reviewed.
+  //
+  // 2. When encoding, it is technically not necessary to attach HW device
+  // context to AVCodecContext. But this way, it will be deallocated
+  // automatically at the time AVCodecContext is freed, so we do that.
+  int ret = av_hwdevice_ctx_create(
+      &ctx->hw_device_ctx,
+      AV_HWDEVICE_TYPE_CUDA,
+      std::to_string(device.index()).c_str(),
+      nullptr,
+      0);
+  TORCH_CHECK(
+      ret >= 0, "Failed to create CUDA device context: ", av_err2string(ret));
+  assert(ctx->hw_device_ctx);
+  ctx->sw_pix_fmt = ctx->pix_fmt;
+  ctx->pix_fmt = AV_PIX_FMT_CUDA;
+  ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx);
+  TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context.");
+  auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data);
+  frames_ctx->format = ctx->pix_fmt;
+  frames_ctx->sw_format = ctx->sw_pix_fmt;
+  frames_ctx->width = ctx->width;
+  frames_ctx->height = ctx->height;
+  frames_ctx->initial_pool_size = 5;
+  ret = av_hwframe_ctx_init(ctx->hw_frames_ctx);
+  TORCH_CHECK(
+      ret >= 0,
+      "Failed to initialize CUDA frame context: ",
+      av_err2string(ret));
+}
 AVCodecContextPtr get_video_codec(
    AVFORMAT_CONST AVOutputFormat* oformat,
    double frame_rate,
@@ -350,59 +400,18 @@ AVCodecContextPtr get_video_codec(
    const c10::optional<std::string>& encoder,
    const c10::optional<OptionDict>& encoder_option,
    const c10::optional<std::string>& encoder_format,
-    const c10::optional<std::string>& hw_accel,
+    const c10::optional<std::string>& hw_accel) {
-    AVBufferRefPtr& hw_device_ctx,
-    AVBufferRefPtr& hw_frame_ctx) {
  AVCodecContextPtr ctx = get_codec_ctx(AVMEDIA_TYPE_VIDEO, oformat, encoder);
  configure_video_codec(ctx, frame_rate, width, height, encoder_format);
  if (hw_accel) {
-#ifndef USE_CUDA
+#ifdef USE_CUDA
-    TORCH_CHECK(
+    configure_hw_accel(ctx, hw_accel.value());
-        false,
-        "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
 #else
-    torch::Device device{hw_accel.value()};
-    TORCH_CHECK(
-        device.type() == c10::DeviceType::CUDA,
-        "Only CUDA is supported for hardware acceleration. Found: ",
-        device.str());
-    AVBufferRef* device_ctx = nullptr;
-    int ret = av_hwdevice_ctx_create(
-        &device_ctx,
-        AV_HWDEVICE_TYPE_CUDA,
-        std::to_string(device.index()).c_str(),
-        nullptr,
-        0);
-    TORCH_CHECK(
-        ret >= 0, "Failed to create CUDA device context: ", av_err2string(ret));
-    hw_device_ctx.reset(device_ctx);
-    AVBufferRef* frames_ref = av_hwframe_ctx_alloc(device_ctx);
-    TORCH_CHECK(frames_ref, "Failed to create CUDA frame context.");
-    hw_frame_ctx.reset(frames_ref);
-    AVHWFramesContext* frames_ctx = (AVHWFramesContext*)(frames_ref->data);
-    frames_ctx->format = AV_PIX_FMT_CUDA;
-    frames_ctx->sw_format = ctx->pix_fmt;
-    frames_ctx->width = ctx->width;
-    frames_ctx->height = ctx->height;
-    frames_ctx->initial_pool_size = 20;
-    ctx->sw_pix_fmt = ctx->pix_fmt;
-    ctx->pix_fmt = AV_PIX_FMT_CUDA;
-    ret = av_hwframe_ctx_init(frames_ref);
-    TORCH_CHECK(
-        ret >= 0,
-        "Failed to initialize CUDA frame context: ",
-        av_err2string(ret));
-    ctx->hw_frames_ctx = av_buffer_ref(frames_ref);
    TORCH_CHECK(
-        ctx->hw_frames_ctx,
+        false,
-        "Failed to attach CUDA frames to encoding context: ",
+        "torchaudio is not compiled with CUDA support. ",
-        av_err2string(ret));
+        "Hardware acceleration is not available.");
 #endif
  }
@@ -478,27 +487,18 @@ void StreamWriter::add_video_stream(
    const c10::optional<OptionDict>& encoder_option,
    const c10::optional<std::string>& encoder_format,
    const c10::optional<std::string>& hw_accel) {
-  AVBufferRefPtr hw_device_ctx{};
-  AVBufferRefPtr hw_frame_ctx{};
-  AVCodecContextPtr ctx = get_video_codec(
-      pFormatContext->oformat,
-      frame_rate,
-      width,
-      height,
-      encoder,
-      encoder_option,
-      encoder_format,
-      hw_accel,
-      hw_device_ctx,
-      hw_frame_ctx);
  streams.emplace_back(std::make_unique<VideoOutputStream>(
      pFormatContext,
      get_src_pixel_fmt(format),
-      std::move(ctx),
+      get_video_codec(
-      std::move(hw_device_ctx),
+          pFormatContext->oformat,
-      std::move(hw_frame_ctx)));
+          frame_rate,
+          width,
+          height,
+          encoder,
+          encoder_option,
+          encoder_format,
+          hw_accel)));
 }
 void StreamWriter::set_metadata(const OptionDict& metadata) {

--- a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp
@@ -59,17 +59,13 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
 VideoOutputStream::VideoOutputStream(
    AVFormatContext* format_ctx,
    AVPixelFormat src_fmt,
-    AVCodecContextPtr&& codec_ctx_,
+    AVCodecContextPtr&& codec_ctx_)
-    AVBufferRefPtr&& hw_device_ctx_,
-    AVBufferRefPtr&& hw_frame_ctx_)
    : OutputStream(
          format_ctx,
          codec_ctx_,
          get_video_filter(src_fmt, codec_ctx_)),
      buffer(get_video_frame(src_fmt, codec_ctx_)),
      converter(buffer),
-      hw_device_ctx(std::move(hw_device_ctx_)),
-      hw_frame_ctx(std::move(hw_frame_ctx_)),
      codec_ctx(std::move(codec_ctx_)) {}
 void VideoOutputStream::write_chunk(const torch::Tensor& frames) {

--- a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h
+++ b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h
@@ -7,16 +7,12 @@ namespace torchaudio::io {
 struct VideoOutputStream : OutputStream {
  AVFramePtr buffer;
  VideoTensorConverter converter;
-  AVBufferRefPtr hw_device_ctx;
-  AVBufferRefPtr hw_frame_ctx;
  AVCodecContextPtr codec_ctx;
  VideoOutputStream(
      AVFormatContext* format_ctx,
      AVPixelFormat src_fmt,
-      AVCodecContextPtr&& codec_ctx,
+      AVCodecContextPtr&& codec_ctx);
-      AVBufferRefPtr&& hw_device_ctx,
-      AVBufferRefPtr&& hw_frame_ctx);
  void write_chunk(const torch::Tensor& frames) override;