Commit 0bf00d20 authored by Moto Hira, committed by Facebook GitHub Bot

Extract image conversions into separate class (#3120)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3120

This commit extracts the image conversion ops into the ImageTensorConverter class, and makes it independent from the OutputStream class.

The ImageTensorConverter class implements a range-based for-loop interface, like

```
for (auto const& frame : ImageTensorConverter::convert(...)) {
    post_process_with_avframe(frame);
}
```

This decouples the encoder from the image conversion.
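Concretely, `VideoOutputStream::write_chunk` in this diff becomes:

```
for (const auto& frame : converter.convert(frames)) {
  frame->pts = num_frames;
  num_frames += 1;
  process_frame(frame);
}
```

where `converter` is a `VideoTensorConverter` member of the output stream.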

Reviewed By: nateanl

Differential Revision: D43666296

fbshipit-source-id: 754efe677bc7695b3f138a6d076be2106e186b79
parent c9c8c7e1
@@ -17,8 +17,10 @@ set(
  stream_reader/stream_processor.cpp
  stream_reader/stream_reader.cpp
  stream_writer/encoder.cpp
  stream_writer/converter.cpp
  stream_writer/output_stream.cpp
  stream_writer/audio_output_stream.cpp
  stream_writer/video_converter.cpp
  stream_writer/video_output_stream.cpp
  stream_writer/stream_writer.cpp
  compat.cpp
#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

namespace torchaudio::io {

using Iterator = Generator::Iterator;
using ConvertFunc = Generator::ConvertFunc;

////////////////////////////////////////////////////////////////////////////////
// Generator
////////////////////////////////////////////////////////////////////////////////

Generator::Generator(torch::Tensor frames_, AVFrame* buff, ConvertFunc& func)
    : frames(std::move(frames_)), buffer(buff), convert_func(func) {}

Iterator Generator::begin() const {
  return Iterator{frames, buffer, convert_func};
}

int64_t Generator::end() const {
  return frames.size(0);
}

////////////////////////////////////////////////////////////////////////////////
// Iterator
////////////////////////////////////////////////////////////////////////////////

Iterator::Iterator(
    const torch::Tensor frames_,
    AVFrame* buffer_,
    ConvertFunc& convert_func_)
    : frames(frames_), buffer(buffer_), convert_func(convert_func_) {}

Iterator& Iterator::operator++() {
  ++i;
  return *this;
}

AVFrame* Iterator::operator*() const {
  convert_func(frames.index({i}), buffer);
  return buffer;
}

bool Iterator::operator!=(const int64_t other) const {
  return i != other;
}

} // namespace torchaudio::io
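For reference, this iteration relies on C++17's heterogeneous range-based for loop: `begin()` returns an `Iterator` while `end()` returns a plain `int64_t` sentinel, and the loop compares them through `operator!=(int64_t)`. A minimal self-contained sketch of the same pattern (illustrative only, not part of this commit):

```
#include <cstdint>
#include <iostream>

struct Squares {
  int64_t n; // number of values to generate

  struct Iterator {
    int64_t i = 0;
    Iterator& operator++() { ++i; return *this; }
    // The "conversion" happens on dereference, as in Generator::Iterator.
    int64_t operator*() const { return i * i; }
    bool operator!=(int64_t end) const { return i != end; }
  };

  Iterator begin() const { return {}; }
  int64_t end() const { return n; } // sentinel: just a count, like frames.size(0)
};

int main() {
  for (auto v : Squares{5}) { // prints: 0 1 4 9 16
    std::cout << v << ' ';
  }
}
```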
#pragma once

#include <torch/types.h>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>

namespace torchaudio::io {

//////////////////////////////////////////////////////////////////////////////
// Generator
//////////////////////////////////////////////////////////////////////////////
// Generator class is responsible for implementing an interface compatible
// with the range-based for loop (begin and end), and for the initialization
// of frame data (channel reordering and ensuring contiguous-ness).
class Generator {
 public:
  // A convert function writes an input frame Tensor to a destination AVFrame.
  // Both the input tensor and the AVFrame are expected to be valid and
  // properly allocated (i.e. it is a glorified copy).
  // It is a one-to-one conversion, performed in Iterator.
  using ConvertFunc = std::function<void(const torch::Tensor&, AVFrame*)>;

  //////////////////////////////////////////////////////////////////////////
  // Iterator
  //////////////////////////////////////////////////////////////////////////
  // Iterator class is responsible for implementing the iterator protocol,
  // that is, increment, comparison, and dereference (which applies the
  // conversion function).
  class Iterator {
    // Input tensor; has to be NCHW or NHWC, uint8, CPU or CUDA.
    // It is sliced at dereference time.
    const torch::Tensor frames;
    // Output buffer (not owned, but modified by Iterator)
    AVFrame* buffer;
    // Function that converts one frame Tensor into AVFrame.
    ConvertFunc& convert_func;
    // Index
    int64_t i = 0;

   public:
    Iterator(
        const torch::Tensor tensor,
        AVFrame* buffer,
        ConvertFunc& convert_func);
    Iterator& operator++();
    AVFrame* operator*() const;
    bool operator!=(const int64_t other) const;
  };

 private:
  // Tensor representing video frames provided by client code.
  // Expected (and validated) to be NCHW, uint8.
  torch::Tensor frames;
  // Output buffer (not owned, passed to iterator)
  AVFrame* buffer;
  // ops: not owned.
  ConvertFunc& convert_func;

 public:
  Generator(torch::Tensor frames, AVFrame* buffer, ConvertFunc& convert_func);
  [[nodiscard]] Iterator begin() const;
  [[nodiscard]] int64_t end() const;
};

} // namespace torchaudio::io
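A minimal sketch of wiring a custom `ConvertFunc` into `Generator` (assumes the torchaudio source tree and FFmpeg development headers are on the include path; the no-op `copy_stub` is a hypothetical stand-in, not part of this commit):

```
#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

using torchaudio::io::Generator;

int main() {
  torch::Tensor frames = torch::zeros({4, 3, 2, 2}, torch::kUInt8); // NCHW
  AVFrame* buffer = av_frame_alloc();
  // A real ConvertFunc copies pixel data into buffer->data; this stub only
  // demonstrates the wiring.
  Generator::ConvertFunc copy_stub = [](const torch::Tensor&, AVFrame*) {};
  for (const auto& frame : Generator{frames, buffer, copy_stub}) {
    // `frame` is the same AVFrame* on every iteration; the conversion
    // function has refreshed its contents for the current slice.
    (void)frame;
  }
  av_frame_free(&buffer);
}
```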
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

#ifdef USE_CUDA
#include <c10/cuda/CUDAStream.h>
#endif

namespace torchaudio::io {

////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter
////////////////////////////////////////////////////////////////////////////////

using InitFunc = VideoTensorConverter::InitFunc;
using ConvertFunc = Generator::ConvertFunc;

namespace {

// Interlaced video
// Each frame is composed of one plane, and the color components of each pixel
// are collocated.
// The memory layout is 1D linear, interpreted as follows.
//
//     |<------ linesize[0] ------>|
//      0   1 ...  W
// 0:  RGB RGB ... RGB PAD ... PAD
// 1:  RGB RGB ... RGB PAD ... PAD
// ...
// H:  RGB RGB ... RGB PAD ... PAD
void write_interlaced_video(const torch::Tensor& frame, AVFrame* buffer) {
  const auto height = frame.size(0);
  const auto width = frame.size(1);
  const auto num_channels = frame.size(2);
  // Source rows are tightly packed; destination rows may be padded, so copy
  // row by row, advancing the destination by linesize[0].
  size_t stride = width * num_channels;
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  uint8_t* src = frame.data_ptr<uint8_t>();
  uint8_t* dst = buffer->data[0];
  for (int h = 0; h < height; ++h) {
    std::memcpy(dst, src, stride);
    src += width * num_channels;
    dst += buffer->linesize[0];
  }
}
// Planar video
// Each frame is composed of multiple planes.
// One plane can contain one or more color components.
// (At the moment, only formats without subsampled color components are
// accepted.)
//
// The memory layout is interpreted as follows.
//
//      |<------ linesize[0] ------>|
//       0  1 ...  W1
// 0:    Y  Y ...  Y  PAD ... PAD
// 1:    Y  Y ...  Y  PAD ... PAD
// ...
// H1:   Y  Y ...  Y  PAD ... PAD
//
//      |<--- linesize[1] --->|
//       0  ...  W2
// 0:   UV ... UV PAD ... PAD
// 1:   UV ... UV PAD ... PAD
// ...
// H2:  UV ... UV PAD ... PAD
//
void write_planar_video(
    const torch::Tensor& frame,
    AVFrame* buffer,
    int num_planes) {
  const auto height = frame.size(1);
  const auto width = frame.size(2);
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  for (int j = 0; j < num_planes; ++j) {
    uint8_t* src = frame.index({j}).data_ptr<uint8_t>();
    uint8_t* dst = buffer->data[j];
    for (int h = 0; h < height; ++h) {
      std::memcpy(dst, src, width);
      src += width;
      dst += buffer->linesize[j];
    }
  }
}
void write_interlaced_video_cuda(
    const torch::Tensor& frame,
    AVFrame* buffer,
    bool pad_extra) {
#ifndef USE_CUDA
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#else
  const auto height = frame.size(0);
  const auto width = frame.size(1);
  const auto num_channels = frame.size(2) + (pad_extra ? 1 : 0);
  // cudaMemcpy2D copies `height` rows of `spitch` bytes each: the source
  // tensor rows are tightly packed (pitch == row size), while the destination
  // plane may be padded (pitch == linesize[0]).
  size_t spitch = width * num_channels;
  if (cudaSuccess !=
      cudaMemcpy2D(
          (void*)(buffer->data[0]),
          buffer->linesize[0],
          (const void*)(frame.data_ptr<uint8_t>()),
          spitch,
          spitch,
          height,
          cudaMemcpyDeviceToDevice)) {
    TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
  }
#endif
}

void write_planar_video_cuda(
    const torch::Tensor& frame,
    AVFrame* buffer,
    int num_planes) {
#ifndef USE_CUDA
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#else
  const auto height = frame.size(1);
  const auto width = frame.size(2);
  // One pitched copy per plane; each plane is `height` rows of `width` bytes
  // in the source tensor.
  for (int j = 0; j < num_planes; ++j) {
    if (cudaSuccess !=
        cudaMemcpy2D(
            (void*)(buffer->data[j]),
            buffer->linesize[j],
            (const void*)(frame.index({j}).data_ptr<uint8_t>()),
            width,
            width,
            height,
            cudaMemcpyDeviceToDevice)) {
      TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
    }
  }
#endif
}
// NCHW -> NHWC, ensure contiguous
torch::Tensor init_interlaced(const torch::Tensor& tensor) {
  return tensor.permute({0, 2, 3, 1}).contiguous();
}

// Keep NCHW, ensure contiguous
torch::Tensor init_planar(const torch::Tensor& tensor) {
  return tensor.contiguous();
}
std::pair<InitFunc, ConvertFunc> get_func(
    enum AVPixelFormat pix_fmt,
    enum AVPixelFormat sw_pix_fmt) {
  using namespace std::placeholders;
  if (pix_fmt == AV_PIX_FMT_CUDA) {
    switch (sw_pix_fmt) {
      case AV_PIX_FMT_RGB0:
      case AV_PIX_FMT_BGR0: {
        ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) {
          write_interlaced_video_cuda(t, f, true);
        };
        return {init_interlaced, convert_func};
      }
      case AV_PIX_FMT_GBRP:
      case AV_PIX_FMT_GBRP16LE:
      case AV_PIX_FMT_YUV444P:
      case AV_PIX_FMT_YUV444P16LE: {
        auto num_planes = av_pix_fmt_count_planes(sw_pix_fmt);
        ConvertFunc convert_func = [=](const torch::Tensor& t, AVFrame* f) {
          write_planar_video_cuda(t, f, num_planes);
        };
        return {init_planar, convert_func};
      }
      default:
        TORCH_CHECK(
            false,
            "Unexpected pixel format for CUDA: ",
            av_get_pix_fmt_name(sw_pix_fmt));
    }
  }
  switch (pix_fmt) {
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24: {
      return {init_interlaced, write_interlaced_video};
    }
    case AV_PIX_FMT_YUV444P: {
      auto num_planes = av_pix_fmt_count_planes(pix_fmt);
      ConvertFunc convert_func = [=](const torch::Tensor& t, AVFrame* f) {
        write_planar_video(t, f, num_planes);
      };
      return {init_planar, convert_func};
    }
    default:
      TORCH_CHECK(
          false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt));
  }
}
AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  AVFramePtr frame{};
  if (codec_ctx->pix_fmt == AV_PIX_FMT_CUDA) {
    int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
    TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
  } else {
    frame->format = src_fmt;
    frame->width = codec_ctx->width;
    frame->height = codec_ctx->height;
    int ret = av_frame_get_buffer(frame, 0);
    TORCH_CHECK(
        ret >= 0,
        "Error allocating a video buffer (",
        av_err2string(ret),
        ").");
  }
  return frame;
}
void validate_video_input(
    enum AVPixelFormat fmt,
    AVCodecContext* ctx,
    const torch::Tensor& t) {
  if (fmt == AV_PIX_FMT_CUDA) {
    TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA.");
    fmt = ctx->sw_pix_fmt;
  } else {
    TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU.");
  }
  auto dtype = t.dtype().toScalarType();
  TORCH_CHECK(dtype == c10::ScalarType::Byte, "Expected Tensor of uint8 type.");
  TORCH_CHECK(t.dim() == 4, "Input Tensor has to be 4D.");
  // Note: the number of color components is not the same as the number of
  // planes. For example, NV12 has only two planes; U and V are interleaved in
  // the second plane.
  int num_color_components = av_pix_fmt_desc_get(fmt)->nb_components;
  const auto channels = t.size(1);
  const auto height = t.size(2);
  const auto width = t.size(3);
  TORCH_CHECK(
      channels == num_color_components && height == ctx->height &&
          width == ctx->width,
      "Expected tensor with shape (N, ",
      num_color_components,
      ", ",
      ctx->height,
      ", ",
      ctx->width,
      ") (NCHW format). Found ",
      t.sizes());
}

} // namespace
VideoTensorConverter::VideoTensorConverter(
    enum AVPixelFormat src_fmt_,
    AVCodecContext* codec_ctx_)
    : src_fmt(src_fmt_),
      codec_ctx(codec_ctx_),
      buffer(get_video_frame(src_fmt_, codec_ctx_)) {
  std::tie(init_func, convert_func) = get_func(src_fmt, codec_ctx->sw_pix_fmt);
}

Generator VideoTensorConverter::convert(const torch::Tensor& frames) {
  validate_video_input(src_fmt, codec_ctx, frames);
  return Generator{init_func(frames), buffer, convert_func};
}

} // namespace torchaudio::io
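As a side note, the effect of `init_interlaced` can be checked in isolation. A small libtorch sketch (assumes libtorch is available; not part of the commit):

```
#include <torch/torch.h>
#include <iostream>

int main() {
  // 2 frames, 3 channels, 4x5 pixels, NCHW.
  auto nchw = torch::arange(2 * 3 * 4 * 5, torch::kUInt8).reshape({2, 3, 4, 5});
  // What init_interlaced does: NCHW -> NHWC, made contiguous so that each
  // frame slice is a flat H*W*C byte block that write_interlaced_video can
  // copy row by row.
  auto nhwc = nchw.permute({0, 2, 3, 1}).contiguous();
  std::cout << nhwc.sizes() << '\n';         // [2, 4, 5, 3]
  std::cout << nhwc.is_contiguous() << '\n'; // 1
}
```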
#pragma once

#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

namespace torchaudio::io {

////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter
////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter is responsible for picking the right pair of conversion
// routines (InitFunc and ConvertFunc) based on the input pixel format, and for
// owning them.
class VideoTensorConverter {
 public:
  // Initialization is a one-time process applied to the input frames before
  // the iteration starts, e.g. converting to NHWC and ensuring contiguity.
  using InitFunc = std::function<torch::Tensor(const torch::Tensor&)>;

 private:
  enum AVPixelFormat src_fmt;
  AVCodecContext* codec_ctx;
  AVFramePtr buffer;
  InitFunc init_func{};
  Generator::ConvertFunc convert_func{};

 public:
  VideoTensorConverter(enum AVPixelFormat src_fmt, AVCodecContext* codec_ctx);
  Generator convert(const torch::Tensor& frames);
};

} // namespace torchaudio::io
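A hedged usage sketch of the public entry point (it mirrors how VideoOutputStream uses the converter in this diff; `encode_frames` is a hypothetical caller and the encoder hook is left as a comment):

```
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

void encode_frames(
    const torch::Tensor& frames, // NCHW, uint8; validated by convert()
    AVPixelFormat src_fmt,
    AVCodecContext* codec_ctx) {
  torchaudio::io::VideoTensorConverter converter{src_fmt, codec_ctx};
  int64_t pts = 0;
  for (const auto& frame : converter.convert(frames)) {
    // The converter-owned AVFrame buffer is overwritten on each iteration,
    // so the frame must be consumed before advancing.
    frame->pts = pts++;
    // ... hand `frame` to the encoder here ...
  }
}
```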
@@ -33,26 +33,6 @@ FilterGraph get_video_filter(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  return p;
}

AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  AVFramePtr frame{};
  if (codec_ctx->pix_fmt == AV_PIX_FMT_CUDA) {
    int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
    TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
  } else {
    frame->format = src_fmt;
    frame->width = codec_ctx->width;
    frame->height = codec_ctx->height;
    int ret = av_frame_get_buffer(frame, 0);
    TORCH_CHECK(
        ret >= 0,
        "Error allocating a video buffer (",
        av_err2string(ret),
        ").");
  }
  return frame;
}

} // namespace
VideoOutputStream::VideoOutputStream(
@@ -65,245 +45,17 @@ VideoOutputStream::VideoOutputStream(
          format_ctx,
          codec_ctx_,
          get_video_filter(src_fmt, codec_ctx_)),
      src_frame(get_video_frame(src_fmt, codec_ctx_)),
      converter(src_fmt, codec_ctx_),
      hw_device_ctx(std::move(hw_device_ctx_)),
      hw_frame_ctx(std::move(hw_frame_ctx_)),
      codec_ctx(std::move(codec_ctx_)) {}
namespace {

void validate_video_input(
    enum AVPixelFormat fmt,
    AVCodecContext* ctx,
    const torch::Tensor& t) {
  if (fmt == AV_PIX_FMT_CUDA) {
    TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA.");
    fmt = ctx->sw_pix_fmt;
  } else {
    TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU.");
  }
  auto dtype = t.dtype().toScalarType();
  TORCH_CHECK(dtype == c10::ScalarType::Byte, "Expected Tensor of uint8 type.");
  TORCH_CHECK(t.dim() == 4, "Input Tensor has to be 4D.");
  // Note: the number of color components is not the same as the number of
  // planes. For example, NV12 has only two planes; U and V are interleaved in
  // the second plane.
  int num_color_components = av_pix_fmt_desc_get(fmt)->nb_components;
  const auto channels = t.size(1);
  const auto height = t.size(2);
  const auto width = t.size(3);
  TORCH_CHECK(
      channels == num_color_components && height == ctx->height &&
          width == ctx->width,
      "Expected tensor with shape (N, ",
      num_color_components,
      ", ",
      ctx->height,
      ", ",
      ctx->width,
      ") (NCHW format). Found ",
      t.sizes());
}
void write_interlaced_video_cuda(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    bool pad_extra) {
#ifdef USE_CUDA
  const auto height = chunk.size(0);
  const auto width = chunk.size(1);
  const auto num_channels = chunk.size(2) + (pad_extra ? 1 : 0);
  size_t spitch = width * num_channels;
  if (cudaSuccess !=
      cudaMemcpy2D(
          (void*)(buffer->data[0]),
          buffer->linesize[0],
          (const void*)(chunk.data_ptr<uint8_t>()),
          spitch,
          spitch,
          height,
          cudaMemcpyDeviceToDevice)) {
    TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
  }
#else
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}

void write_planar_video_cuda(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    int num_planes) {
#ifdef USE_CUDA
  const auto height = chunk.size(1);
  const auto width = chunk.size(2);
  for (int j = 0; j < num_planes; ++j) {
    if (cudaSuccess !=
        cudaMemcpy2D(
            (void*)(buffer->data[j]),
            buffer->linesize[j],
            (const void*)(chunk.index({j}).data_ptr<uint8_t>()),
            width,
            width,
            height,
            cudaMemcpyDeviceToDevice)) {
      TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
    }
  }
#else
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}
// Interlaced video
// Each frame is composed of one plane, and the color components of each pixel
// are collocated.
// The memory layout is 1D linear, interpreted as follows.
//
//     |<------ linesize[0] ------>|
//      0   1 ...  W
// 0:  RGB RGB ... RGB PAD ... PAD
// 1:  RGB RGB ... RGB PAD ... PAD
// ...
// H:  RGB RGB ... RGB PAD ... PAD
void write_interlaced_video(const torch::Tensor& chunk, AVFrame* buffer) {
  const auto height = chunk.size(0);
  const auto width = chunk.size(1);
  const auto num_channels = chunk.size(2);
  size_t stride = width * num_channels;
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  uint8_t* src = chunk.data_ptr<uint8_t>();
  uint8_t* dst = buffer->data[0];
  for (int h = 0; h < height; ++h) {
    std::memcpy(dst, src, stride);
    src += width * num_channels;
    dst += buffer->linesize[0];
  }
}
// Planar video
// Each frame is composed of multiple planes.
// One plane can contain one or more color components.
// (At the moment, only formats without subsampled color components are
// accepted.)
//
// The memory layout is interpreted as follows.
//
//      |<------ linesize[0] ------>|
//       0  1 ...  W1
// 0:    Y  Y ...  Y  PAD ... PAD
// 1:    Y  Y ...  Y  PAD ... PAD
// ...
// H1:   Y  Y ...  Y  PAD ... PAD
//
//      |<--- linesize[1] --->|
//       0  ...  W2
// 0:   UV ... UV PAD ... PAD
// 1:   UV ... UV PAD ... PAD
// ...
// H2:  UV ... UV PAD ... PAD
//
void write_planar_video(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    int num_planes) {
  const auto height = chunk.size(1);
  const auto width = chunk.size(2);
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  for (int j = 0; j < num_planes; ++j) {
    uint8_t* src = chunk.index({j}).data_ptr<uint8_t>();
    uint8_t* dst = buffer->data[j];
    for (int h = 0; h < height; ++h) {
      std::memcpy(dst, src, width);
      src += width;
      dst += buffer->linesize[j];
    }
  }
}

} // namespace
void VideoOutputStream::write_chunk(const torch::Tensor& frames) {
  enum AVPixelFormat fmt = static_cast<AVPixelFormat>(src_frame->format);
  validate_video_input(fmt, codec_ctx, frames);
  const auto num_frames = frames.size(0);
#ifdef USE_CUDA
  if (fmt == AV_PIX_FMT_CUDA) {
    fmt = codec_ctx->sw_pix_fmt;
    switch (fmt) {
      case AV_PIX_FMT_RGB0:
      case AV_PIX_FMT_BGR0: {
        auto chunks = frames.permute({0, 2, 3, 1}).contiguous(); // to NHWC
        for (int i = 0; i < num_frames; ++i) {
          write_interlaced_video_cuda(chunks.index({i}), src_frame, true);
          process_frame();
        }
        return;
      }
      case AV_PIX_FMT_GBRP:
      case AV_PIX_FMT_GBRP16LE:
      case AV_PIX_FMT_YUV444P:
      case AV_PIX_FMT_YUV444P16LE: {
        auto chunks = frames.contiguous();
        for (int i = 0; i < num_frames; ++i) {
          write_planar_video_cuda(
              chunks.index({i}), src_frame, av_pix_fmt_count_planes(fmt));
          process_frame();
        }
        return;
      }
      default:
        TORCH_CHECK(
            false,
            "Unexpected pixel format for CUDA: ",
            av_get_pix_fmt_name(fmt));
    }
  for (const auto& frame : converter.convert(frames)) {
    frame->pts = num_frames;
    num_frames += 1;
    process_frame(frame);
  }
#endif
  switch (fmt) {
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24: {
      auto chunks = frames.permute({0, 2, 3, 1}).contiguous();
      for (int i = 0; i < num_frames; ++i) {
        write_interlaced_video(chunks.index({i}), src_frame);
        process_frame();
      }
      return;
    }
    case AV_PIX_FMT_YUV444P: {
      auto chunks = frames.contiguous();
      for (int i = 0; i < num_frames; ++i) {
        write_planar_video(
            chunks.index({i}), src_frame, av_pix_fmt_count_planes(fmt));
        process_frame();
      }
      return;
    }
    default:
      TORCH_CHECK(false, "Unexpected pixel format: ", av_get_pix_fmt_name(fmt));
  }
}
void VideoOutputStream::process_frame() {
  src_frame->pts = num_frames;
  num_frames += 1;
  OutputStream::process_frame(src_frame);
}

} // namespace torchaudio::io
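Since `write_chunk` stamps `pts` with a running frame index, the presentation time of each frame is `pts * time_base`; the stream is therefore assumed to be configured with `time_base = 1 / frame_rate`. A small arithmetic sketch of that relationship (illustrative assumption, not part of the commit):

```
#include <cstdint>
#include <iostream>

int main() {
  // Assumed configuration: 30 fps, i.e. time_base = {1, 30}.
  const int num = 1, den = 30;
  for (int64_t pts : {0, 30, 90}) {
    // e.g. the frame with pts 90 is presented at 90 * 1/30 = 3.0 seconds.
    std::cout << "pts " << pts << " -> "
              << pts * num / static_cast<double>(den) << " s\n";
  }
}
```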
#pragma once

#include <torchaudio/csrc/ffmpeg/stream_writer/output_stream.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

namespace torchaudio::io {

struct VideoOutputStream : OutputStream {
  AVFramePtr src_frame;
  VideoTensorConverter converter;
  AVBufferRefPtr hw_device_ctx;
  AVBufferRefPtr hw_frame_ctx;
  AVCodecContextPtr codec_ctx;
@@ -18,7 +18,6 @@ struct VideoOutputStream : OutputStream {
      AVBufferRefPtr&& hw_frame_ctx);

  void write_chunk(const torch::Tensor& frames) override;
  void process_frame();
  ~VideoOutputStream() override = default;
};