Commit c12f4734 authored by moto, committed by Facebook GitHub Bot

Add 420p10le CPU support to StreamReader (#3332)

Summary:
This commit adds support for decoding the YUV420P10LE format.

The image tensor returned for this format has:
- NCHW format (C == 3)
- int16 dtype
- value range [0, 2^10)

Note that the value range is different from what the "hevc_cuvid" decoder
returns. The "hevc_cuvid" decoder uses the full range of int16 (internally,
it's uint16) to express the color (with gaps between representable values),
whereas the values returned by the CPU "hevc" decoder fall within [0, 2^10).
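
As a rough usage sketch (the input file name is a placeholder, not from this PR), decoding into this format with StreamReader looks like:

import torch
from torchaudio.io import StreamReader

# "input_10bit.mp4" is a hypothetical 10-bit HEVC (yuv420p10le) file.
s = StreamReader("input_10bit.mp4")
s.add_basic_video_stream(frames_per_chunk=-1, format="yuv420p10le")
s.process_all_packets()
(chunk,) = s.pop_chunks()

assert chunk.dtype == torch.int16              # 10-bit samples carried in int16
assert chunk.ndim == 4 and chunk.size(1) == 3  # NCHW, C == 3
assert 0 <= int(chunk.min()) and int(chunk.max()) < 2**10

Dividing by 1023 maps the CPU decoder's output to [0.0, 1.0]; the same scaling would not be correct for "hevc_cuvid" output, which spans the full int16 range.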

Address https://github.com/pytorch/audio/issues/3331

Pull Request resolved: https://github.com/pytorch/audio/pull/3332

Reviewed By: hwangjeff

Differential Revision: D45925097

Pulled By: mthrok

fbshipit-source-id: 4e669b65c030f388bba2fdbb8f00faf7e2981508
parent d38a7854
@@ -1054,6 +1054,7 @@ class StreamReaderImageTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase)
rgb16 = ((rgb.to(torch.int32) - 128) << 8).to(torch.int16)
yuv = rgb_to_yuv_ccir(rgb)
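# Reference for yuv420p10le: scale the 8-bit CCIR YUV values by 4 (two extra bits).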
yuv16 = yuv.to(torch.int16) * 4
bgr = rgb[:, [2, 1, 0], :, :]
gray = rgb_to_gray(rgb)
argb = torch.cat([alpha, rgb], dim=1)
@@ -1073,6 +1074,7 @@ class StreamReaderImageTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase)
s.add_basic_video_stream(frames_per_chunk=-1, format="rgba")
s.add_basic_video_stream(frames_per_chunk=-1, format="abgr")
s.add_basic_video_stream(frames_per_chunk=-1, format="bgra")
s.add_basic_video_stream(frames_per_chunk=-1, format="yuv420p10le")
s.process_all_packets()
chunks = s.pop_chunks()
self.assertEqual(chunks[0], yuv, atol=1, rtol=0)
@@ -1086,6 +1088,7 @@ class StreamReaderImageTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase)
self.assertEqual(chunks[8], rgba, atol=0, rtol=0)
self.assertEqual(chunks[9], abgr, atol=0, rtol=0)
self.assertEqual(chunks[10], bgra, atol=0, rtol=0)
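# atol=4: the reference is 8-bit data scaled by 4, so it may differ from the
# true 10-bit decode by up to one 8-bit quantization step.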
self.assertEqual(chunks[11], yuv16, atol=4, rtol=0)
@skipIfNoHWAccel("h264_cuvid")
@@ -269,6 +269,78 @@ torch::Tensor YUV420PConverter::convert(const AVFrame* src) {
return buffer;
}
////////////////////////////////////////////////////////////////////////////////
// YUV420P10LE
////////////////////////////////////////////////////////////////////////////////
YUV420P10LEConverter::YUV420P10LEConverter(int h, int w)
: ImageConverterBase(h, w, 3) {
TORCH_WARN_ONCE(
"The output format YUV420PLE is selected. "
"This will be implicitly converted to YUV444P (16-bit), "
"in which all the color components Y, U, V have the same dimension.");
}
void YUV420P10LEConverter::convert(const AVFrame* src, torch::Tensor& dst) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
(AVPixelFormat)(src->format) == AV_PIX_FMT_YUV420P10LE);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kInt16);
// Write Y plane directly
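// Each 10-bit sample occupies 2 bytes (little-endian), hence width * 2 bytes per row.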
{
int16_t* p_dst = dst.data_ptr<int16_t>();
uint8_t* p_src = src->data[0];
for (int h = 0; h < height; ++h) {
memcpy(p_dst, p_src, (size_t)width * 2);
p_dst += width;
p_src += src->linesize[0];
}
}
// Chroma (U and V planes) are subsampled by 2 in both the vertical and
// horizontal directions.
// https://en.wikipedia.org/wiki/Chroma_subsampling
// Since we are returning data in a Tensor, which has the same size for all
// color planes, we need to upsample the UV planes. PyTorch has an interpolate
// function, but it does not work for the int16 type, so we copy manually.
//
// Each (H/2, W/2) chroma plane is replicated into four interleaved blocks
// (even/odd row x even/odd column) of the (H, W) destination:
//
//   ab  ->  aabb
//   cd      aabb
//           ccdd
//           ccdd
//
auto block00 = dst.slice(2, 0, {}, 2).slice(3, 0, {}, 2);
auto block01 = dst.slice(2, 0, {}, 2).slice(3, 1, {}, 2);
auto block10 = dst.slice(2, 1, {}, 2).slice(3, 0, {}, 2);
auto block11 = dst.slice(2, 1, {}, 2).slice(3, 1, {}, 2);
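// Channels 1 and 2 of dst hold the U and V planes (channel 0, Y, was written above).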
for (int i = 1; i < 3; ++i) {
// Borrow the chroma plane from the frame without copying; linesize is in
// bytes, so the row stride in int16 elements is linesize / 2.
auto tmp = torch::from_blob(
src->data[i],
{height / 2, width / 2},
{src->linesize[i] / 2, 1},
[](void*) {},
torch::TensorOptions().dtype(torch::kInt16).layout(torch::kStrided));
// Copy to each block
block00.slice(1, i, i + 1).copy_(tmp);
block01.slice(1, i, i + 1).copy_(tmp);
block10.slice(1, i, i + 1).copy_(tmp);
block11.slice(1, i, i + 1).copy_(tmp);
}
}
torch::Tensor YUV420P10LEConverter::convert(const AVFrame* src) {
torch::Tensor buffer =
get_image_buffer({1, num_channels, height, width}, torch::kInt16);
convert(src, buffer);
return buffer;
}
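
For intuition, the interleaved block copy above can be sketched in PyTorch with a hypothetical 2x2 chroma plane:

import torch

tmp = torch.tensor([[1, 2], [3, 4]], dtype=torch.int16)  # "ab / cd"
up = torch.empty(4, 4, dtype=torch.int16)
up[0::2, 0::2] = tmp  # block00: even rows, even cols
up[0::2, 1::2] = tmp  # block01: even rows, odd cols
up[1::2, 0::2] = tmp  # block10: odd rows, even cols
up[1::2, 1::2] = tmp  # block11: odd rows, odd cols
# up == [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]]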
////////////////////////////////////////////////////////////////////////////////
// NV12
////////////////////////////////////////////////////////////////////////////////
@@ -73,6 +73,13 @@ class YUV420PConverter : public ImageConverterBase {
torch::Tensor convert(const AVFrame* src);
};
class YUV420P10LEConverter : public ImageConverterBase {
public:
YUV420P10LEConverter(int height, int width);
void convert(const AVFrame* src, torch::Tensor& dst);
torch::Tensor convert(const AVFrame* src);
};
class NV12Converter : public ImageConverterBase {
torch::Tensor tmp_uv;
@@ -363,6 +363,11 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
return std::make_unique<ProcessImpl<C, B>>(
std::move(filter), C{h, w}, B{tb});
}
case AV_PIX_FMT_YUV420P10LE: {
using C = YUV420P10LEConverter;
return std::make_unique<ProcessImpl<C, B>>(
std::move(filter), C{h, w}, B{tb});
}
case AV_PIX_FMT_NV12: {
using C = NV12Converter;
return std::make_unique<ProcessImpl<C, B>>(
@@ -474,6 +479,11 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
return std::make_unique<ProcessImpl<C, B>>(
std::move(filter), C{h, w}, B{tb, frames_per_chunk, num_chunks});
}
case AV_PIX_FMT_YUV420P10LE: {
using C = YUV420P10LEConverter;
return std::make_unique<ProcessImpl<C, B>>(
std::move(filter), C{h, w}, B{tb, frames_per_chunk, num_chunks});
}
case AV_PIX_FMT_NV12: {
using C = NV12Converter;
return std::make_unique<ProcessImpl<C, B>>(