Commit c5b96558 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Support CUDA frame in FilterGraph (#3183)

Summary:
This commit adds CUDA frame support to FilterGraph

It initializes and attaches CUDA frames context to FilterGraph,
so that CUDA frames can be processed in FilterGraph.

As a result, it enables
1. CUDA filter support such as `scale_cuda`
2. Properly retrieve the pixel format coming out of FilterGraph when
   CUDA HW acceleration is enabled. (currently it is reported as "cuda")

Resolves https://github.com/pytorch/audio/issues/3159

Pull Request resolved: https://github.com/pytorch/audio/pull/3183

Reviewed By: hwangjeff

Differential Revision: D44183722

Pulled By: mthrok

fbshipit-source-id: 522d21039c361ddfaa87fa89cf49c19d210ac62f
parent 0c8c138c
......@@ -654,6 +654,7 @@ jobs:
command: .circleci/unittest/linux/scripts/run_test.sh
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
......@@ -689,11 +690,12 @@ jobs:
- run:
name: Run tests
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
command: |
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
- store_test_results:
path: test-results
- store_artifacts:
......@@ -726,6 +728,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
......@@ -814,6 +817,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true
......
......@@ -654,6 +654,7 @@ jobs:
command: .circleci/unittest/linux/scripts/run_test.sh
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
......@@ -689,11 +690,12 @@ jobs:
- run:
name: Run tests
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
command: |
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
- store_test_results:
path: test-results
- store_artifacts:
......@@ -726,6 +728,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
......@@ -814,6 +817,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true
......
......@@ -72,6 +72,7 @@ jobs:
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310=true
declare -a args=(
......
......@@ -10,6 +10,7 @@ from .case_utils import (
skipIfNoCuda,
skipIfNoExec,
skipIfNoFFmpeg,
skipIfNoHWAccel,
skipIfNoKaldi,
skipIfNoMacOS,
skipIfNoModule,
......@@ -55,6 +56,7 @@ __all__ = [
"skipIfRocm",
"skipIfNoQengine",
"skipIfNoFFmpeg",
"skipIfNoHWAccel",
"skipIfPy310",
"get_wav_data",
"normalize_wav",
......
......@@ -12,6 +12,7 @@ import torch
import torchaudio
from torch.testing._internal.common_utils import TestCase as PytorchTestCase
from torchaudio._internal.module_utils import is_module_available
from torchaudio.utils.ffmpeg_utils import get_video_decoders, get_video_encoders
from .backend_utils import set_audio_backend
......@@ -270,6 +271,19 @@ skipIfNoMacOS = _skipIf(
)
def skipIfNoHWAccel(name):
    """Decorator factory: skip a test unless FFmpeg HW-acceleration prerequisites are met.

    The test is skipped (under the ``NO_HW_ACCEL`` skip key) when any of the
    following is missing: the FFmpeg integration, a CUDA device, a
    CUDA-enabled torchaudio build, or the named video decoder/encoder.

    Args:
        name (str): Name of the FFmpeg video decoder or encoder the test requires.
    """
    key = "NO_HW_ACCEL"
    if not is_ffmpeg_available():
        return _skipIf(True, reason="ffmpeg features are not available.", key=key)
    if not torch.cuda.is_available():
        return _skipIf(True, reason="CUDA is not available.", key=key)
    if torchaudio._extension._check_cuda_version() is None:
        return _skipIf(True, reason="Torchaudio is not compiled with CUDA.", key=key)
    # Only query the decoder/encoder lists after the FFmpeg check above passed.
    available = set(get_video_decoders()) | set(get_video_encoders())
    if name not in available:
        return _skipIf(True, reason=f"{name} is not in the list of available decoders or encoders", key=key)
    return _pass
def zip_equal(*iterables):
"""With the regular Python `zip` function, if one iterable is longer than the other,
the remainder portions are ignored.This is resolved in Python 3.10 where we can use
......
......@@ -14,10 +14,12 @@ from torchaudio_unittest.common_utils import (
save_image,
save_wav,
skipIfNoFFmpeg,
skipIfNoHWAccel,
TempDirMixin,
TorchaudioTestCase,
)
if is_ffmpeg_available():
from torchaudio.io import StreamReader, StreamWriter
from torchaudio.io._stream_reader import (
......@@ -1048,3 +1050,105 @@ class StreamReaderImageTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase)
self.assertEqual(chunks[8], rgba, atol=0, rtol=0)
self.assertEqual(chunks[9], abgr, atol=0, rtol=0)
self.assertEqual(chunks[10], bgra, atol=0, rtol=0)
@skipIfNoHWAccel("h264_cuvid")
class CuvidHWAccelInterfaceTest(TorchaudioTestCase):
    """Interface checks for mixing CUVID decoding with and without HW acceleration."""

    # Fixed typo in the method name: "acel" -> "accel".
    def test_dup_hw_accel(self):
        """Specifying the same source stream with and without HW accel should fail (instead of segfault later)"""
        src = get_asset_path("nasa_13013.mp4")

        # Plain decoder first, then HW accel on the same stream: must be rejected up front.
        r = StreamReader(src)
        r.add_video_stream(-1, decoder="h264_cuvid")
        with self.assertRaises(RuntimeError):
            r.add_video_stream(-1, decoder="h264_cuvid", hw_accel="cuda")

        # HW accel first, then plain decoder on the same stream: must also be rejected.
        r = StreamReader(src)
        r.add_video_stream(-1, decoder="h264_cuvid", hw_accel="cuda")
        with self.assertRaises(RuntimeError):
            r.add_video_stream(-1, decoder="h264_cuvid")
@_media_source
class CudaDecoderTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase):
    """Decoding via NVIDIA cuvid decoders, with and without CUDA frame output."""

    @skipIfNoHWAccel("h264_cuvid")
    def test_h264_cuvid(self):
        """GPU decoder works for H264"""
        source = self.get_src(get_asset_path("nasa_13013.mp4"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="h264_cuvid")
        total = 0
        for (frames,) in reader.stream():
            # Without hw_accel, frames are downloaded to CPU memory.
            self.assertEqual(frames.device, torch.device("cpu"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 270, 480]))
            total += frames.size(0)
        assert total == 390

    @skipIfNoHWAccel("h264_cuvid")
    def test_h264_cuvid_hw_accel(self):
        """GPU decoder works for H264 with HW acceleration, and put the frames on CUDA tensor"""
        source = self.get_src(get_asset_path("nasa_13013.mp4"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda")
        total = 0
        for (frames,) in reader.stream():
            # With hw_accel="cuda", frames stay on the GPU.
            self.assertEqual(frames.device, torch.device("cuda:0"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 270, 480]))
            total += frames.size(0)
        assert total == 390

    @skipIfNoHWAccel("hevc_cuvid")
    def test_hevc_cuvid(self):
        """GPU decoder works for H265/HEVC"""
        source = self.get_src(get_asset_path("testsrc.hevc"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="hevc_cuvid")
        total = 0
        for (frames,) in reader.stream():
            self.assertEqual(frames.device, torch.device("cpu"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 144, 256]))
            total += frames.size(0)
        assert total == 300

    @skipIfNoHWAccel("hevc_cuvid")
    def test_hevc_cuvid_hw_accel(self):
        """GPU decoder works for H265/HEVC with HW acceleration, and put the frames on CUDA tensor"""
        source = self.get_src(get_asset_path("testsrc.hevc"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="hevc_cuvid", hw_accel="cuda")
        total = 0
        for (frames,) in reader.stream():
            self.assertEqual(frames.device, torch.device("cuda:0"))
            # NOTE(review): dtype differs from the CPU path (int16 vs uint8) —
            # presumably the HW surface format is a 16-bit one; confirm.
            self.assertEqual(frames.dtype, torch.int16)
            self.assertEqual(frames.shape, torch.Size([10, 3, 144, 256]))
            total += frames.size(0)
        assert total == 300
@skipIfNoHWAccel("h264_cuvid")
class FilterGraphWithCudaAccel(TorchaudioTestCase):
    """FilterGraph behavior when CUDA HW acceleration is enabled."""

    # Fixed typo in the method name: "sclae" -> "scale".
    def test_scale_cuda_change_size(self):
        """scale_cuda filter can be used when HW accel is on"""
        src = get_asset_path("nasa_13013.mp4")
        r = StreamReader(src)
        r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda", filter_desc="scale_cuda=iw/2:ih/2")
        num_frames = 0
        for (chunk,) in r.stream():
            self.assertEqual(chunk.device, torch.device("cuda:0"))
            self.assertEqual(chunk.dtype, torch.uint8)
            # scale_cuda=iw/2:ih/2 halves both dimensions of the 480x270 source.
            self.assertEqual(chunk.shape, torch.Size([10, 3, 135, 240]))
            num_frames += chunk.size(0)
        assert num_frames == 390

    def test_scale_cuda_format(self):
        """yuv444p format conversion does not work (yet)"""
        src = get_asset_path("nasa_13013.mp4")
        r = StreamReader(src)
        with self.assertRaises(RuntimeError):
            r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda", filter_desc="scale_cuda=format=yuv444p")
......@@ -165,7 +165,8 @@ void FilterGraph::add_process(const std::string& filter_description) {
av_err2string(ret) + ".)");
}
void FilterGraph::create_filter() {
void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
int ret = avfilter_graph_config(pFilterGraph, nullptr);
TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
// char* desc = avfilter_graph_dump(pFilterGraph, NULL);
......@@ -196,6 +197,10 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
#endif
} else {
if (l->format == AV_PIX_FMT_CUDA && l->hw_frames_ctx) {
auto frames_ctx = (AVHWFramesContext*)(l->hw_frames_ctx->data);
ret.format = frames_ctx->sw_format;
}
ret.frame_rate = l->frame_rate;
ret.height = l->h;
ret.width = l->w;
......
......@@ -63,7 +63,7 @@ class FilterGraph {
void add_process(const std::string& filter_description);
void create_filter();
void create_filter(AVBufferRef* hw_frames_ctx = nullptr);
//////////////////////////////////////////////////////////////////////////////
// Query methods
......
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/chunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/sink.h>
......@@ -47,14 +48,8 @@ std::unique_ptr<Buffer> get_buffer(
codec_ctx->channels);
}
} else {
// Note
// When using HW decoder, the pixel format is CUDA, and FilterGraph does
// not yet support CUDA frames, nor propagating the software pixel format,
// so here, we refer to AVCodecContext* to look at the pixel format.
AVPixelFormat fmt = (AVPixelFormat)(info.format);
if (fmt == AV_PIX_FMT_CUDA) {
fmt = codec_ctx->sw_pix_fmt;
}
TORCH_INTERNAL_ASSERT(fmt != AV_PIX_FMT_CUDA);
if (frames_per_chunk == -1) {
return detail::get_unchunked_buffer(fmt, info.height, info.width, device);
......@@ -77,7 +72,6 @@ FilterGraph get_filter_graph(
AVRational frame_rate,
const std::string& filter_description) {
auto p = FilterGraph{codec_ctx->codec_type};
switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO:
p.add_audio_src(
......@@ -100,7 +94,11 @@ FilterGraph get_filter_graph(
}
p.add_sink();
p.add_process(filter_description);
p.create_filter();
if (codec_ctx->hw_frames_ctx) {
p.create_filter(av_buffer_ref(codec_ctx->hw_frames_ctx));
} else {
p.create_filter(nullptr);
}
return p;
}
......
......@@ -81,6 +81,27 @@ enum AVPixelFormat get_hw_format(
return AV_PIX_FMT_NONE;
}
// Allocates and initializes an AVHWFramesContext bound to the codec's
// HW device context, so CUDA frames can later be attached to a FilterGraph.
// Returns a new AVBufferRef owned by the caller; throws (TORCH_CHECK) on
// allocation or initialization failure.
AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
  AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
  TORCH_CHECK(
      p,
      "Failed to allocate CUDA frame context from device context at ",
      codec_ctx->hw_device_ctx);
  // Mirror the codec's frame geometry and pixel formats into the frame pool.
  auto frames_ctx = (AVHWFramesContext*)(p->data);
  frames_ctx->format = codec_ctx->pix_fmt;       // HW pixel format
  frames_ctx->sw_format = codec_ctx->sw_pix_fmt; // underlying software format
  frames_ctx->width = codec_ctx->width;
  frames_ctx->height = codec_ctx->height;
  // NOTE(review): pool size of 5 looks arbitrary — confirm it is sufficient
  // for the decoder/filter pipeline's in-flight frame count.
  frames_ctx->initial_pool_size = 5;
  int ret = av_hwframe_ctx_init(p);
  if (ret >= 0) {
    return p;
  }
  // Initialization failed: release our reference before raising.
  av_buffer_unref(&p);
  TORCH_CHECK(
      false, "Failed to initialize CUDA frame context: ", av_err2string(ret));
}
void configure_codec_context(
AVCodecContext* codec_ctx,
const AVCodecParameters* params,
......@@ -135,6 +156,9 @@ AVCodecContextPtr get_codec_ctx(
alloc_codec_context(params->codec_id, decoder_name);
configure_codec_context(codec_ctx, params, device);
open_codec(codec_ctx, decoder_option);
if (codec_ctx->hw_device_ctx) {
codec_ctx->hw_frames_ctx = av_buffer_ref(get_hw_frames_ctx(codec_ctx));
}
return codec_ctx;
}
......@@ -160,6 +184,38 @@ KeyType StreamProcessor::add_stream(
AVRational frame_rate,
const c10::optional<std::string>& filter_description,
const torch::Device& device) {
// If device is provided, then check that codec_ctx has hw_device_ctx set.
// In case, defining an output stream with HW accel on an input stream that
// has decoder set without HW accel, it will cause seg fault.
// i.e.
// The following should be rejected here.
// reader = StreamReader(...)
// reader.add_video_stream(..., decoder="h264_cuvid")
// reader.add_video_stream(..., decoder="h264_cuvid", hw_accel="cuda")
// TODO:
// One idea to work around this is to always define HW device context, and
// if HW acceleration is not required, insert `hwdownload` filter.
// This way it will be possible to handle both cases at the same time.
switch (device.type()) {
case torch::kCPU:
TORCH_CHECK(
!codec_ctx->hw_device_ctx,
"Decoding without Hardware acceleration is requested, however, "
"the decoder has been already defined with a HW acceleration. "
"Decoding a stream with and without HW acceleration simultaneously "
"is not supported.");
break;
case torch::kCUDA:
TORCH_CHECK(
codec_ctx->hw_device_ctx,
"CUDA Hardware acceleration is requested, however, the decoder has "
"been already defined without a HW acceleration. "
"Decoding a stream with and without HW acceleration simultaneously "
"is not supported.");
break;
default:;
}
switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO:
case AVMEDIA_TYPE_VIDEO:
......
......@@ -350,6 +350,15 @@ void StreamReader::add_stream(
processors[i] = std::make_unique<StreamProcessor>(
stream->time_base, stream->codecpar, decoder, decoder_option, device);
processors[i]->set_discard_timestamp(seek_timestamp);
} else {
if (decoder) {
// TODO: Validate that the decoder is consistent as the one used to define
// previous output streams.
// i.e. the following is not permitted.
// reader.add_video_stream(..., decoder="h264")
// reader.add_video_stream(..., decoder="x264")
// reader.add_video_stream(..., decoder="h264_cuvid")
}
}
stream->discard = AVDISCARD_DEFAULT;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment