Simplify HW encoder object handling (#3138)

Summary: hw_device_ctx and hw_frames_ctx assigned to an AVCodecContext object are owned by libavcodec, and get freed in [avcodec_free_context](https://ffmpeg.org/doxygen/4.1/group__lavc__core.html#gaf869d0829ed607cec3a4a02a1c7026b3) (actually in [avcodec_close](https://ffmpeg.org/doxygen/4.1/libavcodec_2utils_8c_source.html#l01069)), so we do not need to keep the reference around. Pull Request resolved: https://github.com/pytorch/audio/pull/3138 Reviewed By: nateanl Differential Revision: D43738009 Pulled By: mthrok fbshipit-source-id: 8c1f4217fa7b21dce872d12be9245056f3fc7537

Simplify HW encoder object handling (#3138)
Summary: hw_device_ctx and hw_frames_ctx assigned to an AVCodecContext object are owned by libavcodec, and get freed in [avcodec_free_context](https://ffmpeg.org/doxygen/4.1/group__lavc__core.html#gaf869d0829ed607cec3a4a02a1c7026b3) (actually in [avcodec_close](https://ffmpeg.org/doxygen/4.1/libavcodec_2utils_8c_source.html#l01069)), so we do not need to keep the reference around. Pull Request resolved: https://github.com/pytorch/audio/pull/3138 Reviewed By: nateanl Differential Revision: D43738009 Pulled By: mthrok fbshipit-source-id: 8c1f4217fa7b21dce872d12be9245056f3fc7537
26acdbff · moto · Facebook GitHub Bot · 41e3b93d · 26acdbff · 26acdbff
Commit 26acdbff authored Mar 03, 2023 by moto Committed by Facebook GitHub Bot Mar 03, 2023
3 changed files
--- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -342,67 +342,76 @@ AVCodecContextPtr get_audio_codec(
  return ctx;
 }
-AVCodecContextPtr get_video_codec(
+void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
-    AVFORMAT_CONST AVOutputFormat* oformat,
+  torch::Device device{hw_accel};
-    double frame_rate,
-    int64_t width,
-    int64_t height,
-    const c10::optional<std::string>& encoder,
-    const c10::optional<OptionDict>& encoder_option,
-    const c10::optional<std::string>& encoder_format,
-    const c10::optional<std::string>& hw_accel,
-    AVBufferRefPtr& hw_device_ctx,
-    AVBufferRefPtr& hw_frame_ctx) {
-  AVCodecContextPtr ctx = get_codec_ctx(AVMEDIA_TYPE_VIDEO, oformat, encoder);
-  configure_video_codec(ctx, frame_rate, width, height, encoder_format);
-  if (hw_accel) {
-#ifndef USE_CUDA
-    TORCH_CHECK(
-        false,
-        "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
-#else
-    torch::Device device{hw_accel.value()};
  TORCH_CHECK(
      device.type() == c10::DeviceType::CUDA,
      "Only CUDA is supported for hardware acceleration. Found: ",
      device.str());
-    AVBufferRef* device_ctx = nullptr;
+  // NOTES:
+  // 1. Examples like
+  // https://ffmpeg.org/doxygen/4.1/hw_decode_8c-example.html#a9 wrap the HW
+  // device context and the HW frames context with av_buffer_ref. This
+  // increments the reference count, so the resource won't be automatically
+  // deallocated at the time AVCodecContext is destructed (we would need to
+  // decrement it once ourselves), so we do not do it. When adding support to
+  // share context objects, this needs to be reviewed.
+  //
+  // 2. When encoding, it is technically not necessary to attach HW device
+  // context to AVCodecContext. But this way, it will be deallocated
+  // automatically at the time AVCodecContext is freed, so we do that.
  int ret = av_hwdevice_ctx_create(
-        &device_ctx,
+      &ctx->hw_device_ctx,
      AV_HWDEVICE_TYPE_CUDA,
      std::to_string(device.index()).c_str(),
      nullptr,
      0);
  TORCH_CHECK(
      ret >= 0, "Failed to create CUDA device context: ", av_err2string(ret));
-    hw_device_ctx.reset(device_ctx);
+  assert(ctx->hw_device_ctx);
+  ctx->sw_pix_fmt = ctx->pix_fmt;
+  ctx->pix_fmt = AV_PIX_FMT_CUDA;
-    AVBufferRef* frames_ref = av_hwframe_ctx_alloc(device_ctx);
+  ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx);
-    TORCH_CHECK(frames_ref, "Failed to create CUDA frame context.");
+  TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context.");
-    hw_frame_ctx.reset(frames_ref);
-    AVHWFramesContext* frames_ctx = (AVHWFramesContext*)(frames_ref->data);
+  auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data);
-    frames_ctx->format = AV_PIX_FMT_CUDA;
+  frames_ctx->format = ctx->pix_fmt;
-    frames_ctx->sw_format = ctx->pix_fmt;
+  frames_ctx->sw_format = ctx->sw_pix_fmt;
  frames_ctx->width = ctx->width;
  frames_ctx->height = ctx->height;
-    frames_ctx->initial_pool_size = 20;
+  frames_ctx->initial_pool_size = 5;
-    ctx->sw_pix_fmt = ctx->pix_fmt;
-    ctx->pix_fmt = AV_PIX_FMT_CUDA;
-    ret = av_hwframe_ctx_init(frames_ref);
+  ret = av_hwframe_ctx_init(ctx->hw_frames_ctx);
  TORCH_CHECK(
      ret >= 0,
      "Failed to initialize CUDA frame context: ",
      av_err2string(ret));
+}
-    ctx->hw_frames_ctx = av_buffer_ref(frames_ref);
+AVCodecContextPtr get_video_codec(
+    AVFORMAT_CONST AVOutputFormat* oformat,
+    double frame_rate,
+    int64_t width,
+    int64_t height,
+    const c10::optional<std::string>& encoder,
+    const c10::optional<OptionDict>& encoder_option,
+    const c10::optional<std::string>& encoder_format,
+    const c10::optional<std::string>& hw_accel) {
+  AVCodecContextPtr ctx = get_codec_ctx(AVMEDIA_TYPE_VIDEO, oformat, encoder);
+  configure_video_codec(ctx, frame_rate, width, height, encoder_format);
+  if (hw_accel) {
+#ifdef USE_CUDA
+    configure_hw_accel(ctx, hw_accel.value());
+#else
    TORCH_CHECK(
-        ctx->hw_frames_ctx,
+        false,
-        "Failed to attach CUDA frames to encoding context: ",
+        "torchaudio is not compiled with CUDA support. ",
-        av_err2string(ret));
+        "Hardware acceleration is not available.");
 #endif
  }
@@ -478,10 +487,10 @@ void StreamWriter::add_video_stream(
    const c10::optional<OptionDict>& encoder_option,
    const c10::optional<std::string>& encoder_format,
    const c10::optional<std::string>& hw_accel) {
-  AVBufferRefPtr hw_device_ctx{};
+  streams.emplace_back(std::make_unique<VideoOutputStream>(
-  AVBufferRefPtr hw_frame_ctx{};
+      pFormatContext,
+      get_src_pixel_fmt(format),
-  AVCodecContextPtr ctx = get_video_codec(
+      get_video_codec(
          pFormatContext->oformat,
          frame_rate,
          width,
@@ -489,16 +498,7 @@ void StreamWriter::add_video_stream(
          encoder,
          encoder_option,
          encoder_format,
-      hw_accel,
+          hw_accel)));
-      hw_device_ctx,
-      hw_frame_ctx);
-  streams.emplace_back(std::make_unique<VideoOutputStream>(
-      pFormatContext,
-      get_src_pixel_fmt(format),
-      std::move(ctx),
-      std::move(hw_device_ctx),
-      std::move(hw_frame_ctx)));
 }
 void StreamWriter::set_metadata(const OptionDict& metadata) {

--- a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp
@@ -59,17 +59,13 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
 VideoOutputStream::VideoOutputStream(
    AVFormatContext* format_ctx,
    AVPixelFormat src_fmt,
-    AVCodecContextPtr&& codec_ctx_,
+    AVCodecContextPtr&& codec_ctx_)
-    AVBufferRefPtr&& hw_device_ctx_,
-    AVBufferRefPtr&& hw_frame_ctx_)
    : OutputStream(
          format_ctx,
          codec_ctx_,
          get_video_filter(src_fmt, codec_ctx_)),
      buffer(get_video_frame(src_fmt, codec_ctx_)),
      converter(buffer),
-      hw_device_ctx(std::move(hw_device_ctx_)),
-      hw_frame_ctx(std::move(hw_frame_ctx_)),
      codec_ctx(std::move(codec_ctx_)) {}
 void VideoOutputStream::write_chunk(const torch::Tensor& frames) {

--- a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h
+++ b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h
@@ -7,16 +7,12 @@ namespace torchaudio::io {
 struct VideoOutputStream : OutputStream {
  AVFramePtr buffer;
  VideoTensorConverter converter;
-  AVBufferRefPtr hw_device_ctx;
-  AVBufferRefPtr hw_frame_ctx;
  AVCodecContextPtr codec_ctx;
  VideoOutputStream(
      AVFormatContext* format_ctx,
      AVPixelFormat src_fmt,
-      AVCodecContextPtr&& codec_ctx,
+      AVCodecContextPtr&& codec_ctx);
-      AVBufferRefPtr&& hw_device_ctx,
-      AVBufferRefPtr&& hw_frame_ctx);
  void write_chunk(const torch::Tensor& frames) override;