Commit a8f4e97b authored by Moto Hira's avatar Moto Hira Committed by Facebook GitHub Bot
Browse files

Refactor StreamReader - let StreamProcessor own codec context (#3157)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3157

AVCodecContext plays central role in decoding and encoding.
Currently in StreamReader, the object is owned inside the Decoder class
and is not accessible from other objects.

This commit moves the ownership of AVCodecContext out of the Decoder to
the StreamProcessor class so that other components can access its fields.

Also, the Decoder class, which was a very thin wrapper around the AVCodecContext
object, is now absorbed into the StreamProcessor class.

Reviewed By: xiaohui-zhang

Differential Revision: D43924664

fbshipit-source-id: e53254955d9ce16871e393bcd8bb2794ce6a51ff
parent 430dd17c
...@@ -12,7 +12,6 @@ set( ...@@ -12,7 +12,6 @@ set(
stream_reader/buffer/common.cpp stream_reader/buffer/common.cpp
stream_reader/buffer/chunked_buffer.cpp stream_reader/buffer/chunked_buffer.cpp
stream_reader/buffer/unchunked_buffer.cpp stream_reader/buffer/unchunked_buffer.cpp
stream_reader/decoder.cpp
stream_reader/sink.cpp stream_reader/sink.cpp
stream_reader/stream_processor.cpp stream_reader/stream_processor.cpp
stream_reader/stream_reader.cpp stream_reader/stream_reader.cpp
......
#include <torchaudio/csrc/ffmpeg/stream_reader/decoder.h>
namespace torchaudio {
namespace io {
////////////////////////////////////////////////////////////////////////////////
// Decoder
////////////////////////////////////////////////////////////////////////////////
namespace {
// Finds the decoder for the given codec ID (or the explicitly named one)
// and allocates an AVCodecContext for it.
// Throws (via TORCH_CHECK) if the decoder is unavailable or allocation fails.
AVCodecContextPtr get_decode_context(
    enum AVCodecID codec_id,
    const c10::optional<std::string>& decoder_name) {
  // An explicitly requested decoder name takes precedence over the default
  // decoder registered for the codec ID.
  const AVCodec* pCodec = !decoder_name.has_value()
      ? avcodec_find_decoder(codec_id)
      : avcodec_find_decoder_by_name(decoder_name.value().c_str());
  if (!pCodec) {
    std::stringstream ss;
    if (!decoder_name.has_value()) {
      ss << "Unsupported codec: \"" << avcodec_get_name(codec_id) << "\", ("
         << codec_id << ").";
    } else {
      ss << "Unsupported codec: \"" << decoder_name.value() << "\".";
    }
    // pCodec is known to be null here; fail unconditionally rather than
    // re-testing the pointer, which obscured that this branch always throws.
    TORCH_CHECK(false, ss.str());
  }
  AVCodecContext* pCodecContext = avcodec_alloc_context3(pCodec);
  TORCH_CHECK(pCodecContext, "Failed to allocate CodecContext.");
  return AVCodecContextPtr(pCodecContext);
}
#ifdef USE_CUDA
// AVCodecContext::get_format callback used for HW-accelerated decoding.
// Picks the HW pixel format previously stashed in ctx->opaque (see
// init_codec_context) if the decoder offers it; otherwise warns and
// returns AV_PIX_FMT_NONE, which makes the decoder fail format negotiation.
enum AVPixelFormat get_hw_format(
    AVCodecContext* ctx,
    const enum AVPixelFormat* pix_fmts) {
  AVPixelFormat pix_fmt = *static_cast<AVPixelFormat*>(ctx->opaque);
  // pix_fmts is a list terminated by AV_PIX_FMT_NONE; use the named
  // sentinel instead of the raw -1 it happens to equal.
  for (const enum AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; p++) {
    if (*p == pix_fmt) {
      return *p;
    }
  }
  TORCH_WARN("Failed to get HW surface format.");
  return AV_PIX_FMT_NONE;
}
// Returns the codec's CUDA hardware configuration that supports a HW
// device context. Throws (via TORCH_CHECK) if the codec has none.
const AVCodecHWConfig* get_cuda_config(const AVCodec* pCodec) {
  for (int index = 0;; ++index) {
    const AVCodecHWConfig* cfg = avcodec_get_hw_config(pCodec, index);
    if (!cfg) {
      break;
    }
    const bool is_cuda = cfg->device_type == AV_HWDEVICE_TYPE_CUDA;
    const bool via_device_ctx =
        (cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0;
    if (is_cuda && via_device_ctx) {
      return cfg;
    }
  }
  TORCH_CHECK(
      false,
      "CUDA device was requested, but the codec \"",
      pCodec->name,
      "\" is not supported.");
}
#endif
// Copies stream parameters into the codec context, enables CUDA HW
// acceleration when requested, and opens the codec.
//
// pHwFmt receives the HW pixel format when CUDA decoding is enabled; it
// must outlive the codec context because the get_format callback reads it
// back through pCodecContext->opaque.
void init_codec_context(
    AVCodecContext* pCodecContext,
    AVCodecParameters* pParams,
    const c10::optional<OptionDict>& decoder_option,
    const torch::Device& device,
    enum AVPixelFormat* pHwFmt) {
  int ret = avcodec_parameters_to_context(pCodecContext, pParams);
  TORCH_CHECK(
      ret >= 0, "Failed to set CodecContext parameter: " + av_err2string(ret));

#ifdef USE_CUDA
  // Enable HW Acceleration
  if (device.type() == c10::DeviceType::CUDA) {
    *pHwFmt = get_cuda_config(pCodecContext->codec)->pix_fmt;
    // https://www.ffmpeg.org/doxygen/trunk/hw__decode_8c_source.html#l00221
    // 1. Set HW pixel format (config->pix_fmt) to opaque pointer.
    pCodecContext->opaque = static_cast<void*>(pHwFmt);
    // 2. Set pCodecContext->get_format call back function which
    //    will retrieve the HW pixel format from opaque pointer.
    pCodecContext->get_format = get_hw_format;
  }
#endif

  AVDictionary* opts = get_option_dict(decoder_option);
  // Default to single thread execution.
  if (!av_dict_get(opts, "threads", nullptr, 0)) {
    av_dict_set(&opts, "threads", "1", 0);
  }
  ret = avcodec_open2(pCodecContext, pCodecContext->codec, &opts);
  clean_up_dict(opts);
  TORCH_CHECK(
      ret >= 0, "Failed to initialize CodecContext: " + av_err2string(ret));

  // Audio streams may lack a channel layout; derive the default layout
  // from the channel count so downstream consumers can rely on it.
  // Note: this mutates the caller-provided AVCodecParameters.
  // Braces added: the original brace-less if spanned multiple lines,
  // which is a classic bugprone pattern.
  if (pParams->codec_type == AVMEDIA_TYPE_AUDIO && !pParams->channel_layout) {
    pParams->channel_layout =
        av_get_default_channel_layout(pCodecContext->channels);
  }
}
} // namespace
// Allocates an AVCodecContext for the stream's codec (optionally overridden
// by `decoder_name`), applies `pParam` and `decoder_option`, and opens the
// codec, enabling CUDA HW acceleration when `device` is a CUDA device.
Decoder::Decoder(
    AVCodecParameters* pParam,
    const c10::optional<std::string>& decoder_name,
    const c10::optional<OptionDict>& decoder_option,
    const torch::Device& device)
    : pCodecContext(get_decode_context(pParam->codec_id, decoder_name)) {
  // pHwFmt is passed by pointer: init_codec_context stores the HW pixel
  // format there and wires it into the codec context's opaque field.
  init_codec_context(pCodecContext, pParam, decoder_option, device, &pHwFmt);
}
// Feeds one packet to the decoder (straight delegation to
// avcodec_send_packet). Returns 0 on success or a negative AVERROR;
// passing nullptr puts the decoder into drain mode.
int Decoder::process_packet(AVPacket* pPacket) {
  return avcodec_send_packet(pCodecContext, pPacket);
}
// Retrieves the next decoded frame into pFrame (delegates to
// avcodec_receive_frame). Returns 0 on success, AVERROR(EAGAIN) when more
// input is required, or AVERROR_EOF once the decoder is fully drained.
int Decoder::get_frame(AVFrame* pFrame) {
  return avcodec_receive_frame(pCodecContext, pFrame);
}
// Returns the decoder's running frame counter
// (AVCodecContext::frame_number).
int Decoder::get_frame_number() const {
  return pCodecContext->frame_number;
}
// Resets the decoder's internal buffers; used when seeking.
void Decoder::flush_buffer() {
  avcodec_flush_buffers(pCodecContext);
}
} // namespace io
} // namespace torchaudio
#pragma once
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
namespace torchaudio {
namespace io {
// Thin RAII wrapper around AVCodecContext: turns demuxed AVPacket objects
// into decoded AVFrame objects for one stream.
class Decoder {
  // Owns the codec context; released automatically on destruction.
  AVCodecContextPtr pCodecContext;
  // HW pixel format used for CUDA decoding; stays AV_PIX_FMT_NONE for
  // software decoding. The codec context's get_format callback reads this
  // through the opaque pointer, so it must live as long as the context.
  enum AVPixelFormat pHwFmt = AV_PIX_FMT_NONE;

 public:
  // Builds and opens the codec context from the stream parameters.
  // (Note: there is no default constructor; the previous comment here
  // saying "Default constructable" was incorrect.)
  Decoder(
      AVCodecParameters* pParam,
      const c10::optional<std::string>& decoder_name,
      const c10::optional<OptionDict>& decoder_option,
      const torch::Device& device);

  // Defaulted destructor; AVCodecContextPtr frees the codec context.
  ~Decoder() = default;
  // Non-copyable
  Decoder(const Decoder&) = delete;
  Decoder& operator=(const Decoder&) = delete;
  // Movable
  Decoder(Decoder&&) = default;
  Decoder& operator=(Decoder&&) = default;

  // Process incoming packet
  int process_packet(AVPacket* pPacket);
  // Fetch a decoded frame
  int get_frame(AVFrame* pFrame);
  // Number of frames decoded so far (AVCodecContext::frame_number).
  int get_frame_number() const;
  // Flush buffer (for seek)
  void flush_buffer();
};
} // namespace io
} // namespace torchaudio
...@@ -49,27 +49,27 @@ std::unique_ptr<Buffer> get_buffer( ...@@ -49,27 +49,27 @@ std::unique_ptr<Buffer> get_buffer(
std::unique_ptr<FilterGraph> get_filter_graph( std::unique_ptr<FilterGraph> get_filter_graph(
AVRational input_time_base, AVRational input_time_base,
AVCodecParameters* codecpar, AVCodecContext* codec_ctx,
AVRational frame_rate, AVRational frame_rate,
const std::string& filter_description) { const std::string& filter_description) {
auto p = std::make_unique<FilterGraph>(codecpar->codec_type); auto p = std::make_unique<FilterGraph>(codec_ctx->codec_type);
switch (codecpar->codec_type) { switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO: case AVMEDIA_TYPE_AUDIO:
p->add_audio_src( p->add_audio_src(
static_cast<AVSampleFormat>(codecpar->format), codec_ctx->sample_fmt,
input_time_base, input_time_base,
codecpar->sample_rate, codec_ctx->sample_rate,
codecpar->channel_layout); codec_ctx->channel_layout);
break; break;
case AVMEDIA_TYPE_VIDEO: case AVMEDIA_TYPE_VIDEO:
p->add_video_src( p->add_video_src(
static_cast<AVPixelFormat>(codecpar->format), codec_ctx->pix_fmt,
input_time_base, input_time_base,
frame_rate, frame_rate,
codecpar->width, codec_ctx->width,
codecpar->height, codec_ctx->height,
codecpar->sample_aspect_ratio); codec_ctx->sample_aspect_ratio);
break; break;
default: default:
TORCH_CHECK(false, "Only audio/video are supported."); TORCH_CHECK(false, "Only audio/video are supported.");
...@@ -84,25 +84,25 @@ std::unique_ptr<FilterGraph> get_filter_graph( ...@@ -84,25 +84,25 @@ std::unique_ptr<FilterGraph> get_filter_graph(
Sink::Sink( Sink::Sink(
AVRational input_time_base_, AVRational input_time_base_,
AVCodecParameters* codecpar_, AVCodecContext* codec_ctx_,
int frames_per_chunk, int frames_per_chunk,
int num_chunks, int num_chunks,
AVRational frame_rate_, AVRational frame_rate_,
const c10::optional<std::string>& filter_description_, const c10::optional<std::string>& filter_description_,
const torch::Device& device) const torch::Device& device)
: input_time_base(input_time_base_), : input_time_base(input_time_base_),
codecpar(codecpar_), codec_ctx(codec_ctx_),
frame_rate(frame_rate_), frame_rate(frame_rate_),
filter_description(filter_description_.value_or( filter_description(filter_description_.value_or(
codecpar->codec_type == AVMEDIA_TYPE_AUDIO ? "anull" : "null")), codec_ctx->codec_type == AVMEDIA_TYPE_AUDIO ? "anull" : "null")),
filter(get_filter_graph( filter(get_filter_graph(
input_time_base_, input_time_base_,
codecpar_, codec_ctx,
frame_rate, frame_rate,
filter_description)), filter_description)),
output_time_base(filter->get_output_timebase()), output_time_base(filter->get_output_timebase()),
buffer(get_buffer( buffer(get_buffer(
codecpar_->codec_type, codec_ctx->codec_type,
frames_per_chunk, frames_per_chunk,
num_chunks, num_chunks,
double(output_time_base.num) / output_time_base.den, double(output_time_base.num) / output_time_base.den,
...@@ -139,7 +139,7 @@ FilterGraphOutputInfo Sink::get_filter_output_info() const { ...@@ -139,7 +139,7 @@ FilterGraphOutputInfo Sink::get_filter_output_info() const {
void Sink::flush() { void Sink::flush() {
filter = get_filter_graph( filter = get_filter_graph(
input_time_base, codecpar, frame_rate, filter_description); input_time_base, codec_ctx, frame_rate, filter_description);
buffer->flush(); buffer->flush();
} }
......
...@@ -12,7 +12,7 @@ class Sink { ...@@ -12,7 +12,7 @@ class Sink {
// Parameters for recreating FilterGraph // Parameters for recreating FilterGraph
AVRational input_time_base; AVRational input_time_base;
AVCodecParameters* codecpar; AVCodecContext* codec_ctx;
AVRational frame_rate; AVRational frame_rate;
std::string filter_description; std::string filter_description;
std::unique_ptr<FilterGraph> filter; std::unique_ptr<FilterGraph> filter;
...@@ -23,7 +23,7 @@ class Sink { ...@@ -23,7 +23,7 @@ class Sink {
std::unique_ptr<Buffer> buffer; std::unique_ptr<Buffer> buffer;
Sink( Sink(
AVRational input_time_base, AVRational input_time_base,
AVCodecParameters* codecpar, AVCodecContext* codec_ctx,
int frames_per_chunk, int frames_per_chunk,
int num_chunks, int num_chunks,
AVRational frame_rate, AVRational frame_rate,
......
...@@ -4,15 +4,127 @@ ...@@ -4,15 +4,127 @@
namespace torchaudio { namespace torchaudio {
namespace io { namespace io {
namespace {
// Locates the decoder — the explicitly named one if given, otherwise the
// default decoder registered for the codec ID — and allocates a codec
// context for it. Throws (via TORCH_CHECK) on unknown codec or OOM.
AVCodecContextPtr alloc_codec_context(
    enum AVCodecID codec_id,
    const c10::optional<std::string>& decoder_name) {
  const AVCodec* codec = nullptr;
  if (decoder_name) {
    codec = avcodec_find_decoder_by_name(decoder_name.value().c_str());
    TORCH_CHECK(codec, "Unsupported codec: ", decoder_name.value());
  } else {
    codec = avcodec_find_decoder(codec_id);
    TORCH_CHECK(codec, "Unsupported codec: ", avcodec_get_name(codec_id));
  }

  AVCodecContext* ctx = avcodec_alloc_context3(codec);
  TORCH_CHECK(ctx, "Failed to allocate CodecContext.");
  return AVCodecContextPtr(ctx);
}
// Scans the codec's HW configurations for one that supports CUDA through
// a HW device context. Throws (via TORCH_CHECK) when none exists.
const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) {
  int index = 0;
  const AVCodecHWConfig* cfg = avcodec_get_hw_config(codec, index);
  while (cfg) {
    const bool is_cuda = cfg->device_type == AV_HWDEVICE_TYPE_CUDA;
    const bool via_device_ctx =
        (cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0;
    if (is_cuda && via_device_ctx) {
      return cfg;
    }
    cfg = avcodec_get_hw_config(codec, ++index);
  }
  TORCH_CHECK(
      false,
      "CUDA device was requested, but the codec \"",
      codec->name,
      "\" is not supported.");
}
// AVCodecContext::get_format callback for HW-accelerated decoding.
// The chosen AVCodecHWConfig was stashed in codec_ctx->opaque by
// configure_codec_context; pick its pixel format if the decoder offers it,
// otherwise warn and fail format negotiation with AV_PIX_FMT_NONE.
enum AVPixelFormat get_hw_format(
    AVCodecContext* codec_ctx,
    const enum AVPixelFormat* pix_fmts) {
  const auto* cfg = static_cast<AVCodecHWConfig*>(codec_ctx->opaque);
  const enum AVPixelFormat* candidate = pix_fmts;
  while (*candidate != -1) {
    if (*candidate == cfg->pix_fmt) {
      return *candidate;
    }
    ++candidate;
  }
  TORCH_WARN("Failed to get HW surface format.");
  return AV_PIX_FMT_NONE;
}
// Copies the stream parameters into the codec context and, when a CUDA
// device is requested, wires up HW-accelerated format negotiation.
// Must run before open_codec / avcodec_open2.
void configure_codec_context(
    AVCodecContext* codec_ctx,
    const AVCodecParameters* params,
    const torch::Device& device) {
  int ret = avcodec_parameters_to_context(codec_ctx, params);
  TORCH_CHECK(
      ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret));

  if (device.type() == c10::DeviceType::CUDA) {
#ifndef USE_CUDA
    TORCH_CHECK(false, "torchaudio is not compiled with CUDA support.");
#else
    const AVCodecHWConfig* cfg = get_cuda_config(codec_ctx->codec);
    // https://www.ffmpeg.org/doxygen/trunk/hw__decode_8c_source.html#l00221
    // 1. Set HW config to opaque pointer.
    //    (The HW config outlives the context: avcodec_get_hw_config returns
    //    codec-owned static data, so storing the raw pointer is safe.)
    codec_ctx->opaque = static_cast<void*>(const_cast<AVCodecHWConfig*>(cfg));
    // 2. Set the get_format callback, which will retrieve the HW pixel
    //    format from the opaque pointer during avcodec_open2.
    codec_ctx->get_format = get_hw_format;
#endif
  }
}
// Opens the codec with the user-provided decoder options.
// The channel-layout fallback must be applied before avcodec_open2 so the
// opened context carries a usable layout.
void open_codec(
    AVCodecContext* codec_ctx,
    const c10::optional<OptionDict>& decoder_option) {
  AVDictionary* opts = get_option_dict(decoder_option);
  // Default to single thread execution.
  if (!av_dict_get(opts, "threads", nullptr, 0)) {
    av_dict_set(&opts, "threads", "1", 0);
  }
  // Some audio streams come without a channel layout; derive the default
  // one from the channel count so downstream filter graphs can rely on it.
  if (!codec_ctx->channel_layout) {
    codec_ctx->channel_layout =
        av_get_default_channel_layout(codec_ctx->channels);
  }
  int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
  // Free the options dictionary whether open succeeded or not.
  clean_up_dict(opts);
  TORCH_CHECK(
      ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret));
}
// Produces a fully initialized, opened codec context in three steps:
// allocate for the resolved decoder, copy stream parameters (enabling HW
// acceleration when requested), then open with the decoder options.
AVCodecContextPtr get_codec_ctx(
    const AVCodecParameters* params,
    const c10::optional<std::string>& decoder_name,
    const c10::optional<OptionDict>& decoder_option,
    const torch::Device& device) {
  auto ctx = alloc_codec_context(params->codec_id, decoder_name);
  configure_codec_context(ctx, params, device);
  open_codec(ctx, decoder_option);
  return ctx;
}
} // namespace
using KeyType = StreamProcessor::KeyType; using KeyType = StreamProcessor::KeyType;
StreamProcessor::StreamProcessor( StreamProcessor::StreamProcessor(
AVStream* stream, const AVRational& time_base,
const AVCodecParameters* params,
const c10::optional<std::string>& decoder_name, const c10::optional<std::string>& decoder_name,
const c10::optional<OptionDict>& decoder_option, const c10::optional<OptionDict>& decoder_option,
const torch::Device& device) const torch::Device& device)
: stream(stream), : stream_time_base(time_base),
decoder(stream->codecpar, decoder_name, decoder_option, device) {} codec_ctx(get_codec_ctx(params, decoder_name, decoder_option, device)) {}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Configurations // Configurations
...@@ -23,7 +135,7 @@ KeyType StreamProcessor::add_stream( ...@@ -23,7 +135,7 @@ KeyType StreamProcessor::add_stream(
AVRational frame_rate, AVRational frame_rate,
const c10::optional<std::string>& filter_description, const c10::optional<std::string>& filter_description,
const torch::Device& device) { const torch::Device& device) {
switch (stream->codecpar->codec_type) { switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO: case AVMEDIA_TYPE_AUDIO:
case AVMEDIA_TYPE_VIDEO: case AVMEDIA_TYPE_VIDEO:
break; break;
...@@ -35,8 +147,8 @@ KeyType StreamProcessor::add_stream( ...@@ -35,8 +147,8 @@ KeyType StreamProcessor::add_stream(
std::piecewise_construct, std::piecewise_construct,
std::forward_as_tuple(key), std::forward_as_tuple(key),
std::forward_as_tuple( std::forward_as_tuple(
stream->time_base, stream_time_base,
stream->codecpar, codec_ctx,
frames_per_chunk, frames_per_chunk,
num_chunks, num_chunks,
frame_rate, frame_rate,
...@@ -52,7 +164,7 @@ void StreamProcessor::remove_stream(KeyType key) { ...@@ -52,7 +164,7 @@ void StreamProcessor::remove_stream(KeyType key) {
void StreamProcessor::set_discard_timestamp(int64_t timestamp) { void StreamProcessor::set_discard_timestamp(int64_t timestamp) {
TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative."); TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative.");
discard_before_pts = discard_before_pts =
av_rescale_q(timestamp, av_get_time_base_q(), stream->time_base); av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
...@@ -82,15 +194,15 @@ bool StreamProcessor::is_buffer_ready() const { ...@@ -82,15 +194,15 @@ bool StreamProcessor::is_buffer_ready() const {
// 0: some kind of success // 0: some kind of success
// <0: Some error happened // <0: Some error happened
int StreamProcessor::process_packet(AVPacket* packet) { int StreamProcessor::process_packet(AVPacket* packet) {
int ret = decoder.process_packet(packet); int ret = avcodec_send_packet(codec_ctx, packet);
while (ret >= 0) { while (ret >= 0) {
ret = decoder.get_frame(pFrame1); ret = avcodec_receive_frame(codec_ctx, frame);
// AVERROR(EAGAIN) means that new input data is required to return new // AVERROR(EAGAIN) means that new input data is required to return new
// output. // output.
if (ret == AVERROR(EAGAIN)) if (ret == AVERROR(EAGAIN))
return 0; return 0;
if (ret == AVERROR_EOF) if (ret == AVERROR_EOF)
return send_frame(NULL); return send_frame(nullptr);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -100,8 +212,8 @@ int StreamProcessor::process_packet(AVPacket* packet) { ...@@ -100,8 +212,8 @@ int StreamProcessor::process_packet(AVPacket* packet) {
// //
// We need valid pts because filter_graph does not fall back to // We need valid pts because filter_graph does not fall back to
// best_effort_timestamp. // best_effort_timestamp.
if (pFrame1->pts == AV_NOPTS_VALUE) { if (frame->pts == AV_NOPTS_VALUE) {
if (pFrame1->best_effort_timestamp == AV_NOPTS_VALUE) { if (frame->best_effort_timestamp == AV_NOPTS_VALUE) {
// This happens in drain mode. // This happens in drain mode.
// When the decoder enters drain mode, it starts flushing the internally // When the decoder enters drain mode, it starts flushing the internally
// buffered frames, of which PTS cannot be estimated. // buffered frames, of which PTS cannot be estimated.
...@@ -109,9 +221,9 @@ int StreamProcessor::process_packet(AVPacket* packet) { ...@@ -109,9 +221,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
// This is because they might be intra-frames not in chronological // This is because they might be intra-frames not in chronological
// order. In this case, we use received frames as-is in the order they // order. In this case, we use received frames as-is in the order they
// are received. // are received.
pFrame1->pts = decoder.get_frame_number() + 1; frame->pts = codec_ctx->frame_number + 1;
} else { } else {
pFrame1->pts = pFrame1->best_effort_timestamp; frame->pts = frame->best_effort_timestamp;
} }
} }
...@@ -126,18 +238,18 @@ int StreamProcessor::process_packet(AVPacket* packet) { ...@@ -126,18 +238,18 @@ int StreamProcessor::process_packet(AVPacket* packet) {
// of the data. // of the data.
// //
// Note: discard_before_pts < 0 is UB. // Note: discard_before_pts < 0 is UB.
if (discard_before_pts <= 0 || pFrame1->pts >= discard_before_pts) { if (discard_before_pts <= 0 || frame->pts >= discard_before_pts) {
send_frame(pFrame1); send_frame(frame);
} }
// else we can just unref the frame and continue // else we can just unref the frame and continue
av_frame_unref(pFrame1); av_frame_unref(frame);
} }
return ret; return ret;
} }
void StreamProcessor::flush() { void StreamProcessor::flush() {
decoder.flush_buffer(); avcodec_flush_buffers(codec_ctx);
for (auto& ite : sinks) { for (auto& ite : sinks) {
ite.second.flush(); ite.second.flush();
} }
...@@ -145,10 +257,10 @@ void StreamProcessor::flush() { ...@@ -145,10 +257,10 @@ void StreamProcessor::flush() {
// 0: some kind of success // 0: some kind of success
// <0: Some error happened // <0: Some error happened
int StreamProcessor::send_frame(AVFrame* pFrame) { int StreamProcessor::send_frame(AVFrame* frame_) {
int ret = 0; int ret = 0;
for (auto& ite : sinks) { for (auto& ite : sinks) {
int ret2 = ite.second.process_frame(pFrame); int ret2 = ite.second.process_frame(frame_);
if (ret2 < 0) if (ret2 < 0)
ret = ret2; ret = ret2;
} }
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#include <torch/torch.h> #include <torch/torch.h>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h> #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/decoder.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/sink.h> #include <torchaudio/csrc/ffmpeg/stream_reader/sink.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/typedefs.h> #include <torchaudio/csrc/ffmpeg/stream_reader/typedefs.h>
#include <map> #include <map>
...@@ -15,13 +14,12 @@ class StreamProcessor { ...@@ -15,13 +14,12 @@ class StreamProcessor {
using KeyType = int; using KeyType = int;
private: private:
// Link to the corresponding stream object // Stream time base which is not stored in AVCodecContextPtr
const AVStream* stream; AVRational stream_time_base;
AVCodecContextPtr codec_ctx;
// Components for decoding source media // Components for decoding source media
AVFramePtr pFrame1; AVFramePtr frame;
AVFramePtr pFrame2;
Decoder decoder;
KeyType current_key = 0; KeyType current_key = 0;
std::map<KeyType, Sink> sinks; std::map<KeyType, Sink> sinks;
...@@ -35,7 +33,8 @@ class StreamProcessor { ...@@ -35,7 +33,8 @@ class StreamProcessor {
public: public:
StreamProcessor( StreamProcessor(
AVStream* stream, const AVRational& time_base,
const AVCodecParameters* codecpar,
const c10::optional<std::string>& decoder_name, const c10::optional<std::string>& decoder_name,
const c10::optional<OptionDict>& decoder_option, const c10::optional<OptionDict>& decoder_option,
const torch::Device& device); const torch::Device& device);
......
...@@ -348,7 +348,7 @@ void StreamReader::add_stream( ...@@ -348,7 +348,7 @@ void StreamReader::add_stream(
if (!processors[i]) { if (!processors[i]) {
processors[i] = std::make_unique<StreamProcessor>( processors[i] = std::make_unique<StreamProcessor>(
stream, decoder, decoder_option, device); stream->time_base, stream->codecpar, decoder, decoder_option, device);
processors[i]->set_discard_timestamp(seek_timestamp); processors[i]->set_discard_timestamp(seek_timestamp);
} }
stream->discard = AVDISCARD_DEFAULT; stream->discard = AVDISCARD_DEFAULT;
......
#pragma once #pragma once
#include <torchaudio/csrc/ffmpeg/ffmpeg.h> #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/decoder.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h> #include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/typedefs.h> #include <torchaudio/csrc/ffmpeg/stream_reader/typedefs.h>
#include <vector> #include <vector>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment