Commit 7dff24ca authored by moto, committed by Facebook GitHub Bot

Delay the initialization of CUDA tensor converter (#3419)

Summary:
The StreamReader decoding process is composed of three steps (sketched in code below):

1. Decode the incoming AVPacket into AVFrame
2. Pass AVFrame through AVFilter to perform post process
3. Convert the resulting AVFrame into a Tensor
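
In the Python API, the pipeline roughly corresponds to the following (a minimal sketch; the file name and the filter string are illustrative placeholders):

```python
from torchaudio.io import StreamReader

r = StreamReader("input.mp4")     # demux the media source
r.add_video_stream(
    frames_per_chunk=10,
    decoder="h264",               # step 1: decode AVPacket into AVFrame
    filter_desc="scale=480:270",  # step 2: post-process via AVFilter
)
for (chunk,) in r.stream():       # step 3: AVFrame converted to a Tensor
    print(chunk.shape)            # torch.Size([10, 3, 270, 480])
```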

The internals of StreamReader were refactored in https://github.com/pytorch/audio/issues/3188 so that the above pipeline is initialized at the time the output stream is defined, which allows the output stream shape to be retrieved.

For the CPU decoder this works fine, because resizing happens in step 2 and the resulting shape is retrievable.
However, it is problematic for the GPU decoder: resizing is currently done via a GPU decoder option (step 1), and there seems to be no interface to retrieve the resulting output shape. The refactoring therefore introduced a regression, which is described in https://github.com/pytorch/audio/issues/3405.
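
For reference, a minimal sketch of the GPU path that regressed, mirroring the tests added in this PR (the asset name and option values come from the test suite; the snippet assumes the file is available locally and a single-GPU machine):

```python
import torch
from torchaudio.io import StreamReader

r = StreamReader("nasa_13013.mp4")
# Resizing happens inside the GPU decoder (step 1), so the output shape
# (136x240 here) is not known at the time the output stream is defined.
r.add_video_stream(
    10,
    decoder="h264_cuvid",
    decoder_option={"resize": "240x136"},
    hw_accel="cuda:0",
)
for (chunk,) in r.stream():
    assert chunk.device == torch.device("cuda:0")
    assert chunk.shape == torch.Size([10, 3, 136, 240])
```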

AVFilter internally adapts to changes in the input frame size. This commit changes the conversion process to behave similarly: it waits until the first frame arrives to finalize the frame shape.
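
Conceptually, the converters now follow a lazy-initialization pattern along these lines (a Python sketch of the C++ change below; the class and attribute names are illustrative, not an actual API):

```python
import torch


class LazyCudaConverter:
    """Sketch: defer output-shape finalization until the first frame arrives."""

    def __init__(self, device: torch.device):
        self.device = device
        self.height = -1
        self.width = -1
        self.init = False

    def convert(self, frame) -> torch.Tensor:
        if not self.init:
            # The GPU decoder may have resized/cropped the frame, so the
            # true dimensions are only known once a frame has been decoded.
            self.height, self.width = frame.height, frame.width
            self.init = True
        buffer = torch.empty(
            (1, 3, self.height, self.width), dtype=torch.uint8, device=self.device
        )
        # ...copy the frame's pixel planes into `buffer` here...
        return buffer
```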

Fix https://github.com/pytorch/audio/issues/3405

Pull Request resolved: https://github.com/pytorch/audio/pull/3419

Differential Revision: D46557505

Pulled By: mthrok

fbshipit-source-id: 46ad2d82c8c30f368ebfbaf6947718a5036c7dc6
parent a7fea8a6
@@ -1110,65 +1110,100 @@ class CuvidHWAccelInterfaceTest(TorchaudioTestCase):
 @_media_source
 class CudaDecoderTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase):
     @skipIfNoHWAccel("h264_cuvid")
-    def test_h264_cuvid(self):
-        """GPU decoder works for H264"""
-        src = self.get_src(get_asset_path("nasa_13013.mp4"))
+    def _test_decode(
+        self,
+        decoder: str,
+        src_path: str,
+        height: int,
+        width: int,
+        ref_num_frames: int,
+        hw_accel=None,
+        decoder_option=None,
+        dtype: torch.dtype = torch.uint8,
+    ):
+        src = self.get_src(get_asset_path(src_path))
         r = StreamReader(src)
-        r.add_video_stream(10, decoder="h264_cuvid")
+        r.add_video_stream(10, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel)
         num_frames = 0
         for (chunk,) in r.stream():
-            self.assertEqual(chunk.device, torch.device("cpu"))
-            self.assertEqual(chunk.dtype, torch.uint8)
-            self.assertEqual(chunk.shape, torch.Size([10, 3, 270, 480]))
+            self.assertEqual(chunk.device, torch.device(hw_accel or "cpu"))
+            self.assertEqual(chunk.dtype, dtype)
+            self.assertEqual(chunk.shape, torch.Size([10, 3, height, width]))
             num_frames += chunk.size(0)
-        assert num_frames == 390
+        assert num_frames == ref_num_frames
+
+    @skipIfNoHWAccel("h264_cuvid")
+    def test_h264_cuvid(self):
+        """GPU decoder works for H264"""
+        self._test_decode("h264_cuvid", "nasa_13013.mp4", 270, 480, 390)
 
     @skipIfNoHWAccel("h264_cuvid")
     def test_h264_cuvid_hw_accel(self):
         """GPU decoder works for H264 with HW acceleration, and put the frames on CUDA tensor"""
-        src = self.get_src(get_asset_path("nasa_13013.mp4"))
-        r = StreamReader(src)
-        r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda")
-        num_frames = 0
-        for (chunk,) in r.stream():
-            self.assertEqual(chunk.device, torch.device("cuda:0"))
-            self.assertEqual(chunk.dtype, torch.uint8)
-            self.assertEqual(chunk.shape, torch.Size([10, 3, 270, 480]))
-            num_frames += chunk.size(0)
-        assert num_frames == 390
+        self._test_decode("h264_cuvid", "nasa_13013.mp4", 270, 480, 390, hw_accel="cuda:0")
+
+    @skipIfNoHWAccel("h264_cuvid")
+    def test_h264_cuvid_hw_accel_resize(self):
+        """GPU decoder works for H264 with HW acceleration and resize option"""
+        w, h = 240, 136
+        self._test_decode(
+            "h264_cuvid", "nasa_13013.mp4", h, w, 390, hw_accel="cuda:0", decoder_option={"resize": f"{w}x{h}"}
+        )
+
+    @skipIfNoHWAccel("h264_cuvid")
+    def test_h264_cuvid_hw_accel_crop(self):
+        """GPU decoder works for H264 with HW acceleration and crop option"""
+        top, bottom, left, right = 3, 5, 7, 9
+        self._test_decode(
+            "h264_cuvid",
+            "nasa_13013.mp4",
+            262,
+            464,
+            390,
+            hw_accel="cuda:0",
+            decoder_option={"crop": f"{top}x{bottom}x{left}x{right}"},
+        )
 
     @skipIfNoHWAccel("hevc_cuvid")
     def test_hevc_cuvid(self):
         """GPU decoder works for H265/HEVC"""
-        src = self.get_src(get_asset_path("testsrc.hevc"))
-        r = StreamReader(src)
-        r.add_video_stream(10, decoder="hevc_cuvid")
-        num_frames = 0
-        for (chunk,) in r.stream():
-            self.assertEqual(chunk.device, torch.device("cpu"))
-            self.assertEqual(chunk.dtype, torch.uint8)
-            self.assertEqual(chunk.shape, torch.Size([10, 3, 144, 256]))
-            num_frames += chunk.size(0)
-        assert num_frames == 300
+        self._test_decode("hevc_cuvid", "testsrc.hevc", 144, 256, 300)
 
     @skipIfNoHWAccel("hevc_cuvid")
     def test_hevc_cuvid_hw_accel(self):
         """GPU decoder works for H265/HEVC with HW acceleration, and put the frames on CUDA tensor"""
-        src = self.get_src(get_asset_path("testsrc.hevc"))
-        r = StreamReader(src)
-        r.add_video_stream(10, decoder="hevc_cuvid", hw_accel="cuda")
-        num_frames = 0
-        for (chunk,) in r.stream():
-            self.assertEqual(chunk.device, torch.device("cuda:0"))
-            self.assertEqual(chunk.dtype, torch.int16)
-            self.assertEqual(chunk.shape, torch.Size([10, 3, 144, 256]))
-            num_frames += chunk.size(0)
-        assert num_frames == 300
+        self._test_decode("hevc_cuvid", "testsrc.hevc", 144, 256, 300, hw_accel="cuda:0", dtype=torch.int16)
+
+    @skipIfNoHWAccel("hevc_cuvid")
+    def test_hevc_cuvid_hw_accel_resize(self):
+        """GPU decoder works for H265/HEVC with HW acceleration and resize option"""
+        w, h = 128, 64
+        self._test_decode(
+            "hevc_cuvid",
+            "testsrc.hevc",
+            h,
+            w,
+            300,
+            hw_accel="cuda:0",
+            dtype=torch.int16,
+            decoder_option={"resize": f"{w}x{h}"},
+        )
+
+    @skipIfNoHWAccel("hevc_cuvid")
+    def test_hevc_cuvid_hw_accel_crop(self):
+        """GPU decoder works for H265/HEVC with HW acceleration and crop option"""
+        top, bottom, left, right = 3, 5, 7, 9
+        self._test_decode(
+            "hevc_cuvid",
+            "testsrc.hevc",
+            136,
+            240,
+            300,
+            hw_accel="cuda:0",
+            dtype=torch.int16,
+            decoder_option={"crop": f"{top}x{bottom}x{left}x{right}"},
+        )
 
     @skipIfNoHWAccel("h264_cuvid")
......
@@ -399,15 +399,14 @@ torch::Tensor NV12Converter::convert(const AVFrame* src) {
 #ifdef USE_CUDA
+CudaImageConverterBase::CudaImageConverterBase(const torch::Device& device)
+    : device(device) {}
+
 ////////////////////////////////////////////////////////////////////////////////
 // NV12 CUDA
 ////////////////////////////////////////////////////////////////////////////////
-NV12CudaConverter::NV12CudaConverter(int h, int w, const torch::Device& device)
-    : ImageConverterBase(h, w, 3),
-      tmp_uv(get_image_buffer(
-          {1, height / 2, width / 2, 2},
-          device,
-          torch::kUInt8)) {
+NV12CudaConverter::NV12CudaConverter(const torch::Device& device)
+    : CudaImageConverterBase(device) {
   TORCH_WARN_ONCE(
       "The output format NV12 is selected. "
       "This will be implicitly converted to YUV444P, "
@@ -470,8 +469,16 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
 }
 
 torch::Tensor NV12CudaConverter::convert(const AVFrame* src) {
-  torch::Tensor buffer =
-      get_image_buffer({1, num_channels, height, width}, tmp_uv.device());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src);
+  if (!init) {
+    height = src->height;
+    width = src->width;
+    tmp_uv =
+        get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kUInt8);
+    init = true;
+  }
+  torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device);
   convert(src, buffer);
   return buffer;
 }
@@ -479,12 +486,8 @@ torch::Tensor NV12CudaConverter::convert(const AVFrame* src) {
 ////////////////////////////////////////////////////////////////////////////////
 // P010 CUDA
 ////////////////////////////////////////////////////////////////////////////////
-P010CudaConverter::P010CudaConverter(int h, int w, const torch::Device& device)
-    : ImageConverterBase(h, w, 3),
-      tmp_uv(get_image_buffer(
-          {1, height / 2, width / 2, 2},
-          device,
-          torch::kInt16)) {
+P010CudaConverter::P010CudaConverter(const torch::Device& device)
+    : CudaImageConverterBase{device} {
   TORCH_WARN_ONCE(
       "The output format P010 is selected. "
      "This will be implicitly converted to YUV444P, "
@@ -551,8 +554,17 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
 }
 
 torch::Tensor P010CudaConverter::convert(const AVFrame* src) {
-  torch::Tensor buffer = get_image_buffer(
-      {1, num_channels, height, width}, tmp_uv.device(), torch::kInt16);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src);
+  if (!init) {
+    height = src->height;
+    width = src->width;
+    tmp_uv =
+        get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kInt16);
+    init = true;
+  }
+  torch::Tensor buffer =
+      get_image_buffer({1, 3, height, width}, device, torch::kInt16);
   convert(src, buffer);
   return buffer;
 }
@@ -560,11 +572,8 @@ torch::Tensor P010CudaConverter::convert(const AVFrame* src) {
 ////////////////////////////////////////////////////////////////////////////////
 // YUV444P CUDA
 ////////////////////////////////////////////////////////////////////////////////
-YUV444PCudaConverter::YUV444PCudaConverter(
-    int h,
-    int w,
-    const torch::Device& device)
-    : ImageConverterBase(h, w, 3), device(device) {}
+YUV444PCudaConverter::YUV444PCudaConverter(const torch::Device& device)
+    : CudaImageConverterBase(device) {}
 
 void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src);
@@ -589,7 +598,7 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
       FFMPEG av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
-  for (int i = 0; i < num_channels; ++i) {
+  for (int i = 0; i < 3; ++i) {
     auto status = cudaMemcpy2D(
         dst.index({0, i}).data_ptr(),
         width,
@@ -604,8 +613,13 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
 }
 
 torch::Tensor YUV444PCudaConverter::convert(const AVFrame* src) {
-  torch::Tensor buffer =
-      get_image_buffer({1, num_channels, height, width}, device);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src);
+  if (!init) {
+    height = src->height;
+    width = src->width;
+    init = true;
+  }
+  torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device);
   convert(src, buffer);
   return buffer;
 }
......
@@ -87,29 +87,40 @@ class NV12Converter : public ImageConverterBase {
 #ifdef USE_CUDA
 
-class NV12CudaConverter : ImageConverterBase {
-  torch::Tensor tmp_uv;
+// Note:
+// GPU decoders are tricky. They allow to change the resolution as part of
+// decoder option, and the resulting resolution is (seemingly) not retrievable.
+// Therefore, we adopt delayed frame size initialization.
+// For that purpose, we do not inherit from ImageConverterBase.
+struct CudaImageConverterBase {
+  const torch::Device device;
+  bool init = false;
+  int height = -1;
+  int width = -1;
+  explicit CudaImageConverterBase(const torch::Device& device);
+};
+
+class NV12CudaConverter : CudaImageConverterBase {
+  torch::Tensor tmp_uv{};
 
  public:
-  NV12CudaConverter(int height, int width, const torch::Device& device);
+  explicit NV12CudaConverter(const torch::Device& device);
   void convert(const AVFrame* src, torch::Tensor& dst);
   torch::Tensor convert(const AVFrame* src);
 };
 
-class P010CudaConverter : ImageConverterBase {
-  torch::Tensor tmp_uv;
+class P010CudaConverter : CudaImageConverterBase {
+  torch::Tensor tmp_uv{};
 
  public:
-  P010CudaConverter(int height, int width, const torch::Device& device);
+  explicit P010CudaConverter(const torch::Device& device);
   void convert(const AVFrame* src, torch::Tensor& dst);
   torch::Tensor convert(const AVFrame* src);
 };
 
-class YUV444PCudaConverter : ImageConverterBase {
-  const torch::Device device;
+class YUV444PCudaConverter : CudaImageConverterBase {
 
  public:
-  YUV444PCudaConverter(int height, int width, const torch::Device& device);
+  explicit YUV444PCudaConverter(const torch::Device& device);
   void convert(const AVFrame* src, torch::Tensor& dst);
   torch::Tensor convert(const AVFrame* src);
 };
......
@@ -403,17 +403,17 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
     case AV_PIX_FMT_NV12: {
      using C = NV12CudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
-          std::move(filter), C{i.height, i.width, device}, B{i.time_base});
+          std::move(filter), C{device}, B{i.time_base});
     }
     case AV_PIX_FMT_P010: {
       using C = P010CudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
-          std::move(filter), C{i.height, i.width, device}, B{i.time_base});
+          std::move(filter), C{device}, B{i.time_base});
     }
     case AV_PIX_FMT_YUV444P: {
       using C = YUV444PCudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
-          std::move(filter), C{i.height, i.width, device}, B{i.time_base});
+          std::move(filter), C{device}, B{i.time_base});
     }
     case AV_PIX_FMT_P016: {
       TORCH_CHECK(
@@ -524,21 +524,21 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
       using C = NV12CudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
           std::move(filter),
-          C{i.height, i.width, device},
+          C{device},
           B{i.time_base, frames_per_chunk, num_chunks});
     }
     case AV_PIX_FMT_P010: {
       using C = P010CudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
           std::move(filter),
-          C{i.height, i.width, device},
+          C{device},
           B{i.time_base, frames_per_chunk, num_chunks});
     }
     case AV_PIX_FMT_YUV444P: {
       using C = YUV444PCudaConverter;
       return std::make_unique<ProcessImpl<C, B>>(
           std::move(filter),
-          C{i.height, i.width, device},
+          C{device},
           B{i.time_base, frames_per_chunk, num_chunks});
     }
     case AV_PIX_FMT_P016: {
......