Commit 0dd57236 authored by moto, committed by Facebook GitHub Bot

Refactor FilterGraph interface (#2508)

Summary:
FilterGraph is necessary for StreamWriter when saving video, as the
Tensor array format cannot express common video formats like yuv420.

The current implementation of FilterGraph is specific to StreamReader,
as it takes an AVCodecParameters object rather than individual parameters.

This PR refactors the FilterGraph interface so that it can be constructed
from more primitive information.
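For illustration, a minimal sketch of how the refactored interface is intended to be used in the audio case, based on the methods introduced in this PR; the format, time base, sample rate, and channel layout values below are placeholders, not taken from the PR:

```c++
// Before: construction required an AVCodecParameters* obtained from StreamReader.
// FilterGraph filter(time_base, codecpar, filter_description);

// After: construct from the media type, then configure with primitive values.
FilterGraph filter(AVMEDIA_TYPE_AUDIO);
filter.add_audio_src(
    AV_SAMPLE_FMT_FLTP,    // sample format
    AVRational{1, 44100},  // time base
    44100,                 // sample rate
    AV_CH_LAYOUT_STEREO);  // channel layout
filter.add_sink();
filter.add_process("anull");  // pass-through audio filter expression
filter.create_filter();
```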

Pull Request resolved: https://github.com/pytorch/audio/pull/2508

Reviewed By: hwangjeff

Differential Revision: D37466033

Pulled By: mthrok

fbshipit-source-id: 8414e985da7579c2dfe260b4dccd2afe113bb573
parent 0ad03adf
@@ -89,9 +89,11 @@ decoder::~Decoder();
```c++
// Default construction (no memory allocation)
filter_graph = FilterGraph();
filter_graph = FilterGraph(AVMEDIA_TYPE_AUDIO);
// Filter configuration
...
filter_graph.add_audio_src(..)
filter_graph.add_sink(..)
filter_graph.add_process("<filter expression>")
filter_graph.create_filter();
// Apply filter
filter_graph.add_frame(pFrame);
......
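By analogy, the video path would presumably be configured as follows (a sketch inferred from the add_video_src signature added in this PR; the pixel format, dimensions, and time base are placeholder values):

```c++
FilterGraph filter_graph(AVMEDIA_TYPE_VIDEO);
filter_graph.add_video_src(
    AV_PIX_FMT_YUV420P,  // pixel format
    AVRational{1, 30},   // time base
    1280,                // width
    720,                 // height
    AVRational{1, 1});   // sample aspect ratio
filter_graph.add_sink();
filter_graph.add_process("null");  // pass-through video filter expression
filter_graph.create_filter();
```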
@@ -4,32 +4,25 @@
namespace torchaudio {
namespace ffmpeg {
FilterGraph::FilterGraph(
AVRational time_base,
AVCodecParameters* codecpar,
const c10::optional<std::string>& filter_description)
: input_time_base(time_base),
codecpar(codecpar),
filter_description(filter_description.value_or(
codecpar->codec_type == AVMEDIA_TYPE_AUDIO ? "anull" : "null")),
media_type(codecpar->codec_type) {
init();
FilterGraph::FilterGraph(AVMediaType media_type) : media_type(media_type) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
case AVMEDIA_TYPE_VIDEO:
break;
default:
throw std::runtime_error("Only audio and video type is supported.");
}
}
////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
std::string FilterGraph::get_description() const {
return filter_description;
};
////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
namespace {
std::string get_audio_src_args(
AVSampleFormat format,
AVRational time_base,
AVCodecParameters* codecpar) {
int sample_rate,
uint64_t channel_layout) {
char args[512];
std::snprintf(
args,
@@ -37,60 +30,61 @@ std::string get_audio_src_args(
"time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
time_base.num,
time_base.den,
codecpar->sample_rate,
av_get_sample_fmt_name(static_cast<AVSampleFormat>(codecpar->format)),
codecpar->channel_layout);
sample_rate,
av_get_sample_fmt_name(format),
channel_layout);
return std::string(args);
}
std::string get_video_src_args(
AVPixelFormat format,
AVRational time_base,
AVCodecParameters* codecpar) {
int width,
int height,
AVRational sample_aspect_ratio) {
char args[512];
std::snprintf(
args,
sizeof(args),
"video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:pixel_aspect=%d/%d",
codecpar->width,
codecpar->height,
av_get_pix_fmt_name(static_cast<AVPixelFormat>(codecpar->format)),
width,
height,
av_get_pix_fmt_name(format),
time_base.num,
time_base.den,
codecpar->sample_aspect_ratio.num,
codecpar->sample_aspect_ratio.den);
sample_aspect_ratio.num,
sample_aspect_ratio.den);
return std::string(args);
}
} // namespace
void FilterGraph::init() {
add_src();
add_sink();
add_process();
create_filter();
void FilterGraph::add_audio_src(
AVSampleFormat format,
AVRational time_base,
int sample_rate,
uint64_t channel_layout) {
TORCH_CHECK(
media_type == AVMEDIA_TYPE_AUDIO, "The filter graph is not audio type.");
std::string args =
get_audio_src_args(format, time_base, sample_rate, channel_layout);
add_src(args);
}
void FilterGraph::reset() {
pFilterGraph.reset();
buffersrc_ctx = nullptr;
buffersink_ctx = nullptr;
init();
void FilterGraph::add_video_src(
AVPixelFormat format,
AVRational time_base,
int width,
int height,
AVRational sample_aspect_ratio) {
TORCH_CHECK(
media_type == AVMEDIA_TYPE_VIDEO, "The filter graph is not video type.");
std::string args =
get_video_src_args(format, time_base, width, height, sample_aspect_ratio);
add_src(args);
}
void FilterGraph::add_src() {
std::string args;
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
args = get_audio_src_args(input_time_base, codecpar);
break;
case AVMEDIA_TYPE_VIDEO:
args = get_video_src_args(input_time_base, codecpar);
break;
default:
throw std::runtime_error("Only audio/video are supported.");
}
void FilterGraph::add_src(const std::string& args) {
const AVFilter* buffersrc = avfilter_get_by_name(
media_type == AVMEDIA_TYPE_AUDIO ? "abuffer" : "buffer");
int ret = avfilter_graph_create_filter(
@@ -103,9 +97,6 @@ void FilterGraph::add_src() {
}
void FilterGraph::add_sink() {
if (media_type == AVMEDIA_TYPE_UNKNOWN) {
throw std::runtime_error("Source buffer is not allocated.");
}
if (buffersink_ctx) {
throw std::runtime_error("Sink buffer is already allocated.");
}
@@ -158,7 +149,7 @@ class InOuts {
} // namespace
void FilterGraph::add_process() {
void FilterGraph::add_process(const std::string& filter_description) {
// Note
// The official example and other derived codes out there use
// https://ffmpeg.org/doxygen/4.1/filtering_audio_8c-example.html#_a37
......
@@ -5,26 +5,17 @@ namespace torchaudio {
namespace ffmpeg {
class FilterGraph {
// Parameters required for `reset`
// Recreates the underlying filter_graph struct
AVRational input_time_base;
AVCodecParameters* codecpar;
std::string filter_description;
// Constant just for convenient access.
AVMediaType media_type;
AVFilterGraphPtr pFilterGraph;
// AVFilterContext is freed as a part of AVFilterGraph
// so we do not manage the resource.
AVFilterContext* buffersrc_ctx = nullptr;
AVFilterContext* buffersink_ctx = nullptr;
public:
FilterGraph(
AVRational time_base,
AVCodecParameters* codecpar,
const c10::optional<std::string>& filter_desc);
explicit FilterGraph(AVMediaType media_type);
// Custom destructor to release AVFilterGraph*
~FilterGraph() = default;
// Non-copyable
@@ -34,24 +25,27 @@ class FilterGraph {
FilterGraph(FilterGraph&&) = default;
FilterGraph& operator=(FilterGraph&&) = default;
//////////////////////////////////////////////////////////////////////////////
// Query method
//////////////////////////////////////////////////////////////////////////////
std::string get_description() const;
//////////////////////////////////////////////////////////////////////////////
// Configuration methods
//////////////////////////////////////////////////////////////////////////////
void init();
void add_audio_src(
AVSampleFormat format,
AVRational time_base,
int sample_rate,
uint64_t channel_layout);
void reset();
void add_video_src(
AVPixelFormat format,
AVRational time_base,
int width,
int height,
AVRational sample_aspect_ratio);
private:
void add_src();
void add_src(const std::string& arg);
void add_sink();
void add_process();
void add_process(const std::string& filter_description);
void create_filter();
......
@@ -23,18 +23,54 @@ std::unique_ptr<Buffer> get_buffer(
av_get_media_type_string(type));
}
}
} // namespace
Sink::Sink(
std::unique_ptr<FilterGraph> get_filter_graph(
AVRational input_time_base,
AVCodecParameters* codecpar,
const std::string& filter_description) {
auto p = std::make_unique<FilterGraph>(codecpar->codec_type);
switch (codecpar->codec_type) {
case AVMEDIA_TYPE_AUDIO:
p->add_audio_src(
static_cast<AVSampleFormat>(codecpar->format),
input_time_base,
codecpar->sample_rate,
codecpar->channel_layout);
break;
case AVMEDIA_TYPE_VIDEO:
p->add_video_src(
static_cast<AVPixelFormat>(codecpar->format),
input_time_base,
codecpar->width,
codecpar->height,
codecpar->sample_aspect_ratio);
break;
default:
throw std::runtime_error("Only audio/video are supported.");
}
p->add_sink();
p->add_process(filter_description);
p->create_filter();
return p;
}
} // namespace
Sink::Sink(
AVRational input_time_base_,
AVCodecParameters* codecpar_,
int frames_per_chunk,
int num_chunks,
const c10::optional<std::string>& filter_description,
const c10::optional<std::string>& filter_description_,
const torch::Device& device)
: filter(input_time_base, codecpar, filter_description),
: input_time_base(input_time_base_),
codecpar(codecpar_),
filter_description(filter_description_.value_or(
codecpar->codec_type == AVMEDIA_TYPE_AUDIO ? "anull" : "null")),
filter(get_filter_graph(input_time_base_, codecpar_, filter_description)),
buffer(get_buffer(
codecpar->codec_type,
codecpar_->codec_type,
frames_per_chunk,
num_chunks,
device)) {}
@@ -42,9 +78,9 @@ Sink::Sink(
// 0: some kind of success
// <0: Some error happened
int Sink::process_frame(AVFrame* pFrame) {
int ret = filter.add_frame(pFrame);
int ret = filter->add_frame(pFrame);
while (ret >= 0) {
ret = filter.get_frame(frame);
ret = filter->get_frame(frame);
// AVERROR(EAGAIN) means that new input data is required to return new
// output.
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
@@ -56,12 +92,16 @@ int Sink::process_frame(AVFrame* pFrame) {
return ret;
}
std::string Sink::get_filter_description() const {
return filter_description;
}
bool Sink::is_buffer_ready() const {
return buffer->is_ready();
}
void Sink::flush() {
filter.reset();
filter = get_filter_graph(input_time_base, codecpar, filter_description);
buffer->flush();
}
......
@@ -10,8 +10,13 @@ namespace ffmpeg {
class Sink {
AVFramePtr frame;
// Parameters for recreating FilterGraph
AVRational input_time_base;
AVCodecParameters* codecpar;
std::string filter_description;
std::unique_ptr<FilterGraph> filter;
public:
FilterGraph filter;
std::unique_ptr<Buffer> buffer;
Sink(
AVRational input_time_base,
@@ -21,6 +26,7 @@ class Sink {
const c10::optional<std::string>& filter_description,
const torch::Device& device);
std::string get_filter_description() const;
int process_frame(AVFrame* frame);
bool is_buffer_ready() const;
......
@@ -53,7 +53,7 @@ void StreamProcessor::remove_stream(KeyType key) {
// Query methods
////////////////////////////////////////////////////////////////////////////////
std::string StreamProcessor::get_filter_description(KeyType key) const {
return sinks.at(key).filter.get_description();
return sinks.at(key).get_filter_description();
}
bool StreamProcessor::is_buffer_ready() const {
......
#pragma once
#include <torchaudio/csrc/ffmpeg/decoder.h>
#include <torchaudio/csrc/ffmpeg/filter_graph.h>
#include <torchaudio/csrc/ffmpeg/stream_processor.h>
#include <torchaudio/csrc/ffmpeg/typedefs.h>
#include <vector>
......