"docs/source/vscode:/vscode.git/clone" did not exist on "b8ce0f41a3d440aa367e076e2c1556e26bce383d"
Commit 95eada24 authored by moto, committed by Facebook GitHub Bot

Add CUDA HW encoding support to StreamWriter (#2505)

Summary:
This commit adds CUDA hardware encoding to StreamWriter.
For certain video formats, it can encode video directly from a
CUDA Tensor, without needing to move the data to the host CPU.

Pull Request resolved: https://github.com/pytorch/audio/pull/2505

Reviewed By: hwangjeff

Differential Revision: D37446830

Pulled By: mthrok

fbshipit-source-id: eee6424f01a99a3b611dcad45ed58f86cba4672a
parent 28da8b84
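
For illustration, a minimal usage sketch of the new option. It assumes the Python API shown at the end of this diff, an FFmpeg build with NVENC support, and the torchaudio.io import path (the class lived under a prototype namespace around this commit); the pixel-format combination is an assumption, not taken from this diff.

import torch
from torchaudio.io import StreamWriter  # import path assumed; may be a prototype module at this commit

# Configure one H.264 NVENC stream that consumes CUDA tensors directly.
writer = StreamWriter("output.mp4")
writer.add_video_stream(
    frame_rate=30,
    width=320,
    height=240,
    format="yuv444p",          # source pixel format (assumed)
    encoder="h264_nvenc",      # CUDA hardware encoder
    encoder_format="yuv444p",  # planar format handled by the CUDA write path
    hw_accel="cuda:0",         # this stream now expects CUDA tensors
)

with writer.open():
    # (num_frames, channels, height, width), uint8, already resident on the GPU
    chunk = torch.randint(0, 256, (30, 3, 240, 320), dtype=torch.uint8, device="cuda:0")
    writer.write_video_chunk(0, chunk)
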
@@ -41,7 +41,8 @@ void StreamWriterFileObj::add_video_stream(
std::string format,
const c10::optional<std::string>& encoder,
const c10::optional<std::map<std::string, std::string>>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
StreamWriter::add_video_stream(
frame_rate,
width,
@@ -49,7 +50,8 @@ void StreamWriterFileObj::add_video_stream(
format,
encoder,
map2dict(encoder_option),
encoder_format);
encoder_format,
hw_accel);
}
} // namespace ffmpeg
......
@@ -27,7 +27,8 @@ class StreamWriterFileObj : protected FileObj, public StreamWriterBinding {
std::string format,
const c10::optional<std::string>& encoder,
const c10::optional<std::map<std::string, std::string>>& encoder_option,
const c10::optional<std::string>& encoder_format);
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel);
};
} // namespace ffmpeg
......
#include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
#ifdef USE_CUDA
#include <c10/cuda/CUDAStream.h>
#endif
namespace torchaudio {
namespace ffmpeg {
namespace {
@@ -258,6 +262,13 @@ AVFramePtr get_audio_frame(
return frame;
}
AVFramePtr get_hw_video_frame(AVCodecContextPtr& codec_ctx) {
AVFramePtr frame{};
int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
return frame;
}
AVFramePtr get_video_frame(
enum AVPixelFormat fmt,
AVCodecContextPtr& codec_ctx) {
@@ -417,7 +428,9 @@ void StreamWriter::add_audio_stream(
std::move(src_frame),
std::move(dst_frame),
0,
frame_capacity});
frame_capacity,
AVBufferRefPtr{},
AVBufferRefPtr{}});
}
void StreamWriter::add_video_stream(
@@ -427,21 +440,93 @@ void StreamWriter::add_video_stream(
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
const torch::Device device = [&]() {
if (!hw_accel) {
return torch::Device{c10::DeviceType::CPU};
}
#ifdef USE_CUDA
torch::Device d{hw_accel.value()};
TORCH_CHECK(
d.type() == c10::DeviceType::CUDA,
"Only CUDA is supported for hardware acceleration. Found:",
device.str());
return d;
#else
TORCH_CHECK(
false,
"torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}();
enum AVPixelFormat src_fmt = _get_src_pixel_fmt(format);
AVCodecContextPtr ctx =
get_codec_ctx(AVMEDIA_TYPE_VIDEO, pFormatContext->oformat, encoder);
configure_video_codec(ctx, frame_rate, width, height, encoder_format);
AVBufferRefPtr hw_device_ctx{};
AVBufferRefPtr hw_frame_ctx{};
#ifdef USE_CUDA
if (device.type() == c10::DeviceType::CUDA) {
AVBufferRef* device_ctx = nullptr;
int ret = av_hwdevice_ctx_create(
&device_ctx,
AV_HWDEVICE_TYPE_CUDA,
std::to_string(device.index()).c_str(),
nullptr,
0);
TORCH_CHECK(
ret >= 0, "Failed to create CUDA device context: ", av_err2string(ret));
hw_device_ctx.reset(device_ctx);
AVBufferRef* frames_ref = av_hwframe_ctx_alloc(device_ctx);
TORCH_CHECK(frames_ref, "Failed to create CUDA frame context.");
hw_frame_ctx.reset(frames_ref);
AVHWFramesContext* frames_ctx = (AVHWFramesContext*)(frames_ref->data);
frames_ctx->format = AV_PIX_FMT_CUDA;
frames_ctx->sw_format = ctx->pix_fmt;
frames_ctx->width = ctx->width;
frames_ctx->height = ctx->height;
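// Pre-allocate a pool of 20 GPU frames for the encoder to reuse.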
frames_ctx->initial_pool_size = 20;
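// The encoder works on AV_PIX_FMT_CUDA frames; the actual pixel layout is recorded in sw_pix_fmt.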
ctx->sw_pix_fmt = ctx->pix_fmt;
ctx->pix_fmt = AV_PIX_FMT_CUDA;
ret = av_hwframe_ctx_init(frames_ref);
TORCH_CHECK(
ret >= 0,
"Failed to initialize CUDA frame context: ",
av_err2string(ret));
ctx->hw_frames_ctx = av_buffer_ref(frames_ref);
TORCH_CHECK(
ctx->hw_frames_ctx,
"Failed to attach CUDA frames to encoding context: ",
av_err2string(ret));
}
#endif
open_codec(ctx, encoder_option);
AVStream* stream = add_stream(ctx);
std::unique_ptr<FilterGraph> filter = src_fmt == ctx->pix_fmt
? std::unique_ptr<FilterGraph>(nullptr)
: _get_video_filter(src_fmt, ctx);
AVFramePtr src_frame = get_video_frame(src_fmt, ctx);
std::unique_ptr<FilterGraph> filter = [&]() {
if (src_fmt != ctx->pix_fmt && device.type() == c10::DeviceType::CPU) {
return _get_video_filter(src_fmt, ctx);
}
return std::unique_ptr<FilterGraph>(nullptr);
}();
// CUDA: require src_frame
// CPU: require dst_frame when filter is enabled
AVFramePtr src_frame = [&]() {
if (device.type() == c10::DeviceType::CUDA) {
return get_hw_video_frame(ctx);
}
return get_video_frame(src_fmt, ctx);
}();
AVFramePtr dst_frame =
filter ? AVFramePtr{} : get_video_frame(ctx->pix_fmt, ctx);
filter ? get_video_frame(ctx->pix_fmt, ctx) : AVFramePtr{};
streams.emplace_back(OutputStream{
stream,
std::move(ctx),
@@ -449,7 +534,9 @@ void StreamWriter::add_video_stream(
std::move(src_frame),
std::move(dst_frame),
0,
-1});
-1,
std::move(hw_device_ctx),
std::move(hw_frame_ctx)});
}
AVStream* StreamWriter::add_stream(AVCodecContextPtr& codec_ctx) {
@@ -720,6 +807,31 @@ void StreamWriter::write_video_chunk(int i, const torch::Tensor& frames) {
OutputStream& os = streams[i];
enum AVPixelFormat fmt = static_cast<AVPixelFormat>(os.src_frame->format);
#ifdef USE_CUDA
if (fmt == AV_PIX_FMT_CUDA) {
TORCH_CHECK(frames.device().is_cuda(), "Input tensor has to be on CUDA.");
enum AVPixelFormat sw_fmt = os.codec_ctx->sw_pix_fmt;
validate_video_input(sw_fmt, os.codec_ctx, frames);
switch (sw_fmt) {
case AV_PIX_FMT_RGB0:
case AV_PIX_FMT_BGR0:
write_interlaced_video_cuda(os, frames, true);
return;
case AV_PIX_FMT_GBRP:
case AV_PIX_FMT_GBRP16LE:
case AV_PIX_FMT_YUV444P:
case AV_PIX_FMT_YUV444P16LE:
write_planar_video_cuda(os, frames, av_pix_fmt_count_planes(sw_fmt));
return;
default:
TORCH_CHECK(
false,
"Unexpected pixel format for CUDA: ",
av_get_pix_fmt_name(sw_fmt));
}
}
#endif
TORCH_CHECK(frames.device().is_cpu(), "Input tensor has to be on CPU.");
validate_video_input(fmt, os.codec_ctx, frames);
switch (fmt) {
@@ -736,6 +848,75 @@ void StreamWriter::write_video_chunk(int i, const torch::Tensor& frames) {
}
}
#ifdef USE_CUDA
void StreamWriter::write_interlaced_video_cuda(
OutputStream& os,
const torch::Tensor& frames,
bool pad_extra) {
const auto num_frames = frames.size(0);
const auto num_channels = frames.size(1);
const auto height = frames.size(2);
const auto width = frames.size(3);
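// RGB0/BGR0 use four bytes per pixel with the last byte unused, so the
// staging buffer carries one extra channel when pad_extra is set.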
const auto num_channels_buffer = num_channels + (pad_extra ? 1 : 0);
using namespace torch::indexing;
torch::Tensor buffer =
torch::empty({height, width, num_channels_buffer}, frames.options());
size_t spitch = width * num_channels_buffer;
for (int i = 0; i < num_frames; ++i) {
// Slice frame as HWC
auto chunk = frames.index({i}).permute({1, 2, 0});
buffer.index_put_({"...", Slice(0, num_channels)}, chunk);
if (cudaSuccess !=
cudaMemcpy2D(
(void*)(os.src_frame->data[0]),
os.src_frame->linesize[0],
(const void*)(buffer.data_ptr<uint8_t>()),
spitch,
spitch,
height,
cudaMemcpyDeviceToDevice)) {
TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
}
os.src_frame->pts = os.num_frames;
os.num_frames += 1;
encode_frame(os.src_frame, os.codec_ctx, os.stream);
}
}
void StreamWriter::write_planar_video_cuda(
OutputStream& os,
const torch::Tensor& frames,
int num_planes) {
const auto num_frames = frames.size(0);
const auto height = frames.size(2);
const auto width = frames.size(3);
using namespace torch::indexing;
torch::Tensor buffer = torch::empty({height, width}, frames.options());
for (int i = 0; i < num_frames; ++i) {
for (int j = 0; j < num_planes; ++j) {
buffer.index_put_({"..."}, frames.index({i, j}));
if (cudaSuccess !=
cudaMemcpy2D(
(void*)(os.src_frame->data[j]),
os.src_frame->linesize[j],
(const void*)(buffer.data_ptr<uint8_t>()),
width,
width,
height,
cudaMemcpyDeviceToDevice)) {
TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
}
}
os.src_frame->pts = os.num_frames;
os.num_frames += 1;
encode_frame(os.src_frame, os.codec_ctx, os.stream);
}
}
#endif
// Interlaced video
// Each frame is composed of one plane, and color components for each pixel are
// collocated.
......
@@ -17,10 +17,14 @@ struct OutputStream {
int64_t num_frames;
// Audio-only: The maximum frames that frame can hold
int64_t frame_capacity;
// Video-only: HW acceleration
AVBufferRefPtr hw_device_ctx;
AVBufferRefPtr hw_frame_ctx;
};
class StreamWriter {
AVFormatOutputContextPtr pFormatContext;
AVBufferRefPtr pHWBufferRef;
std::vector<OutputStream> streams;
AVPacketPtr pkt;
@@ -55,7 +59,8 @@ class StreamWriter {
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format);
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel);
void set_metadata(const OptionDict& metadata);
private:
@@ -79,6 +84,16 @@ class StreamWriter {
const torch::Tensor& chunk,
int num_planes);
void write_interlaced_video(OutputStream& os, const torch::Tensor& chunk);
#ifdef USE_CUDA
void write_planar_video_cuda(
OutputStream& os,
const torch::Tensor& chunk,
int num_planes);
void write_interlaced_video_cuda(
OutputStream& os,
const torch::Tensor& chunk,
bool pad_extra = true);
#endif
void process_frame(
AVFrame* src_frame,
std::unique_ptr<FilterGraph>& filter,
......
@@ -43,7 +43,8 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
s->add_video_stream(
frame_rate,
width,
@@ -51,7 +52,8 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
format,
encoder,
encoder_option,
encoder_format);
encoder_format,
hw_accel);
})
.def(
"set_metadata",
......
@@ -159,6 +159,7 @@ class StreamWriter:
encoder: Optional[str] = None,
encoder_option: Optional[Dict[str, str]] = None,
encoder_format: Optional[str] = None,
hw_accel: Optional[str] = None,
):
"""Add an output video stream.
@@ -189,8 +190,18 @@ class StreamWriter:
encoder_option (dict or None, optional): {encoder_option}
encoder_format (str or None, optional): {encoder_format}
hw_accel (str or None, optional): Enable hardware acceleration.
When the video is encoded on CUDA hardware, for example with
`encoder="h264_nvenc"`, passing a CUDA device indicator to `hw_accel`
(e.g. `hw_accel="cuda:0"`) makes StreamWriter expect the video
chunks to be CUDA Tensors. Passing a CPU Tensor results in an error.
If `None`, the video chunk Tensor has to be a CPU Tensor.
Default: ``None``.
"""
self._s.add_video_stream(frame_rate, width, height, format, encoder, encoder_option, encoder_format)
self._s.add_video_stream(frame_rate, width, height, format, encoder, encoder_option, encoder_format, hw_accel)
def set_metadata(self, metadata: Dict[str, str]):
"""Set file-level metadata
......
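
To make the device contract described in the hw_accel docstring concrete, a hedged sketch contrasting the two modes (same API assumptions as the sketch near the top of this page; the stream parameters are illustrative):

import torch
from torchaudio.io import StreamWriter  # import path assumed, as above

writer = StreamWriter("output.mp4")
# Stream 0: default CPU path (hw_accel=None) -> write_video_chunk expects CPU tensors.
writer.add_video_stream(30, 320, 240, format="yuv444p")
# Stream 1: CUDA path -> write_video_chunk expects CUDA tensors; a CPU tensor raises an error.
writer.add_video_stream(
    30, 320, 240, format="yuv444p",
    encoder="h264_nvenc", encoder_format="yuv444p", hw_accel="cuda:0",
)

with writer.open():
    writer.write_video_chunk(0, torch.zeros((10, 3, 240, 320), dtype=torch.uint8))
    writer.write_video_chunk(1, torch.zeros((10, 3, 240, 320), dtype=torch.uint8, device="cuda:0"))
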