Decouple StreamProcessor construction and decoder configuration (#3223)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3223

Each `StreamProcessor` is responsible for processing a source stream. In the case where we support packet passthrough, `StreamProcessor`'s choice of decoder is irrelevant as no decoding is performed. Currently, however, `StreamProcessor` requires decoder params and fixes a decoder at construction time. To accommodate this future packet passthrough use case, this PR decouples the construction of `StreamProcessor` from the configuration of the decoder that it uses.

Reviewed By: mthrok

Differential Revision: D44554934

fbshipit-source-id: 1d1a89015e1181b71dfb95c928de4fc3ec6f63b6

Decouple StreamProcessor construction and decoder configuration (#3223)
Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3223

Each `StreamProcessor` is responsible for processing a source stream. In the case where we support packet passthrough, `StreamProcessor`'s choice of decoder is irrelevant as no decoding is performed. Currently, however, `StreamProcessor` requires decoder params and fixes a decoder at construction time. To accommodate this future packet passthrough use case, this PR decouples the construction of `StreamProcessor` from the configuration of the decoder that it uses.

Reviewed By: mthrok

Differential Revision: D44554934

fbshipit-source-id: 1d1a89015e1181b71dfb95c928de4fc3ec6f63b6
bb75caa4 · Jeff Hwang · Facebook GitHub Bot · 493b5018 · bb75caa4 · bb75caa4
Commit bb75caa4 authored Mar 30, 2023 by Jeff Hwang Committed by Facebook GitHub Bot Mar 30, 2023
3 changed files
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
@@ -168,14 +168,8 @@ AVCodecContextPtr get_codec_ctx(
 using KeyType = StreamProcessor::KeyType;
-StreamProcessor::StreamProcessor(
+StreamProcessor::StreamProcessor(const AVRational& time_base)
-    const AVRational& time_base,
+    : stream_time_base(time_base) {}
-    const AVCodecParameters* params,
-    const c10::optional<std::string>& decoder_name,
-    const c10::optional<OptionDict>& decoder_option,
-    const torch::Device& device)
-    : stream_time_base(time_base),
-      codec_ctx(get_codec_ctx(params, decoder_name, decoder_option, device)) {}
 ////////////////////////////////////////////////////////////////////////////////
 // Configurations
@@ -186,6 +180,8 @@ KeyType StreamProcessor::add_stream(
    AVRational frame_rate,
    const std::string& filter_description,
    const torch::Device& device) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      is_decoder_set(), "Decoder hasn't been set.");
  // If device is provided, then check that codec_ctx has hw_device_ctx set.
  // Defining an output stream with HW accel on an input stream whose decoder
  // was set up without HW accel would otherwise cause a segfault.
@@ -258,6 +254,15 @@ void StreamProcessor::set_discard_timestamp(int64_t timestamp) {
      av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base);
 }
+void StreamProcessor::set_decoder(
+    const AVCodecParameters* codecpar,
+    const c10::optional<std::string>& decoder_name,
+    const c10::optional<OptionDict>& decoder_option,
+    const torch::Device& device) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!codec_ctx, "Decoder has already been set.");
+  codec_ctx = get_codec_ctx(codecpar, decoder_name, decoder_option, device);
+}
 ////////////////////////////////////////////////////////////////////////////////
 // Query methods
 ////////////////////////////////////////////////////////////////////////////////
@@ -279,12 +284,19 @@ bool StreamProcessor::is_buffer_ready() const {
  return true;
 }
+bool StreamProcessor::is_decoder_set() const {
+  return codec_ctx;
+}
 ////////////////////////////////////////////////////////////////////////////////
 // The streaming process
 ////////////////////////////////////////////////////////////////////////////////
 // 0: some kind of success
 // <0: Some error happened
 int StreamProcessor::process_packet(AVPacket* packet) {
+  if (!is_decoder_set()) {
+    return 0;
+  }
  int ret = avcodec_send_packet(codec_ctx, packet);
  while (ret >= 0) {
    ret = avcodec_receive_frame(codec_ctx, frame);
@@ -340,9 +352,11 @@ int StreamProcessor::process_packet(AVPacket* packet) {
 }
 void StreamProcessor::flush() {
-  avcodec_flush_buffers(codec_ctx);
+  if (is_decoder_set()) {
-  for (auto& ite : post_processes) {
+    avcodec_flush_buffers(codec_ctx);
-    ite.second->flush();
+    for (auto& ite : post_processes) {
+      ite.second->flush();
+    }
  }
 }

--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h
@@ -16,9 +16,9 @@ class StreamProcessor {
 private:
  // Stream time base which is not stored in AVCodecContextPtr
  AVRational stream_time_base;
-  AVCodecContextPtr codec_ctx;
  // Components for decoding source media
+  AVCodecContextPtr codec_ctx{nullptr};
  AVFramePtr frame;
  KeyType current_key = 0;
@@ -32,12 +32,7 @@ class StreamProcessor {
  int64_t discard_before_pts = 0;
 public:
-  StreamProcessor(
+  explicit StreamProcessor(const AVRational& time_base);
-      const AVRational& time_base,
-      const AVCodecParameters* codecpar,
-      const c10::optional<std::string>& decoder_name,
-      const c10::optional<OptionDict>& decoder_option,
-      const torch::Device& device);
  ~StreamProcessor() = default;
  // Non-copyable
  StreamProcessor(const StreamProcessor&) = delete;
@@ -69,6 +64,12 @@ class StreamProcessor {
  // The input timestamp must be expressed in AV_TIME_BASE unit.
  void set_discard_timestamp(int64_t timestamp);
+  void set_decoder(
+      const AVCodecParameters* codecpar,
+      const c10::optional<std::string>& decoder_name,
+      const c10::optional<OptionDict>& decoder_option,
+      const torch::Device& device);
  //////////////////////////////////////////////////////////////////////////////
  // Query methods
  //////////////////////////////////////////////////////////////////////////////
@@ -76,6 +77,7 @@ class StreamProcessor {
  [[nodiscard]] FilterGraphOutputInfo get_filter_output_info(KeyType key) const;
  bool is_buffer_ready() const;
+  [[nodiscard]] bool is_decoder_set() const;
  //////////////////////////////////////////////////////////////////////////////
  // The streaming process

--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
@@ -345,19 +345,18 @@ void StreamReader::add_stream(
      "Failed to detect the source stream format.");
  if (!processors[i]) {
-    processors[i] = std::make_unique<StreamProcessor>(
+    processors[i] = std::make_unique<StreamProcessor>(stream->time_base);
-        stream->time_base, stream->codecpar, decoder, decoder_option, device);
    processors[i]->set_discard_timestamp(seek_timestamp);
+  }
+  if (!processors[i]->is_decoder_set()) {
+    processors[i]->set_decoder(
+        stream->codecpar, decoder, decoder_option, device);
  } else {
-    if (decoder) {
+    TORCH_CHECK(
-      // TODO: Validate that the decoder is consistent as the one used to define
+        !decoder && (!decoder_option || decoder_option.value().size() == 0),
-      // previous output streams.
+        "Decoder options were provided, but the decoder has already been initialized.")
-      // i.e. the following is not permitted.
-      // reader.add_video_stream(..., decoder="h264")
-      // reader.add_video_stream(..., decoder="x264")
-      // reader.add_video_stream(..., decoder="h264_cuvid")
-    }
  }
  stream->discard = AVDISCARD_DEFAULT;
  auto frame_rate = [&]() -> AVRational {