Commit 9cb75e74 authored by moto, committed by Facebook GitHub Bot

Update and fill the rest of ffmpeg-integration C++ code (#2113)

Summary:
- Introduce AudioBuffer and VideoBuffer for different ways of handling frames
- Update the way the option dictionary is passed
- Remove unused AutoFrameUnref
- Add SrcStreamInfo/OutputStreamInfo classes

Pull Request resolved: https://github.com/pytorch/audio/pull/2113

Reviewed By: nateanl

Differential Revision: D33356144

Pulled By: mthrok

fbshipit-source-id: e837e84fae48baa7befd5c70599bcd2cbb61514d
parent fd3c9573
@@ -5,8 +5,27 @@
namespace torchaudio {
namespace ffmpeg {
Buffer::Buffer(AVMediaType type) : media_type(type) {}
Buffer::Buffer(int frames_per_chunk, int num_chunks)
: frames_per_chunk(frames_per_chunk), num_chunks(num_chunks) {}
AudioBuffer::AudioBuffer(int frames_per_chunk, int num_chunks)
: Buffer(frames_per_chunk, num_chunks) {}
VideoBuffer::VideoBuffer(int frames_per_chunk, int num_chunks)
: Buffer(frames_per_chunk, num_chunks) {}
////////////////////////////////////////////////////////////////////////////////
// Query
////////////////////////////////////////////////////////////////////////////////
bool Buffer::is_ready() const {
if (frames_per_chunk < 0)
return num_buffered_frames > 0;
return num_buffered_frames >= frames_per_chunk;
}
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Audio
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
// ref: https://ffmpeg.org/doxygen/4.1/filter__audio_8c_source.html#l00215
@@ -82,10 +101,64 @@ torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
}
} // namespace
void Buffer::push_audio_frame(AVFrame* pFrame) {
chunks.push_back(convert_audio_tensor(pFrame));
void AudioBuffer::push_tensor(torch::Tensor t) {
// If frames_per_chunk < 0, users want to fetch all frames.
// Just push back to chunks and that's it.
if (frames_per_chunk < 0) {
chunks.push_back(t);
num_buffered_frames += t.size(0);
return;
}
// Push
// Note:
// For audio, the incoming tensor contains multiple frames (samples).
// For a small `frames_per_chunk` value, that can be more than `max_frames`.
// If we pushed the tensor as-is, the whole tensor could then be popped at the
// trimming stage, leaving the buffer always empty. So we slice the incoming
// Tensor and push the pieces.
// Check the last inserted Tensor; if its number of frames is not
// `frames_per_chunk`, reprocess it together with the incoming tensor.
if (num_buffered_frames % frames_per_chunk) {
torch::Tensor prev = chunks.back();
chunks.pop_back();
num_buffered_frames -= prev.size(0);
t = torch::cat({prev, t}, 0);
}
while (true) {
int num_input_frames = t.size(0);
if (num_input_frames <= frames_per_chunk) {
chunks.push_back(t);
num_buffered_frames += num_input_frames;
break;
}
// The input tensor contains more frames than frames_per_chunk
auto splits = torch::tensor_split(t, {frames_per_chunk, num_input_frames});
chunks.push_back(splits[0]);
num_buffered_frames += frames_per_chunk;
t = splits[1];
}
// Trim
// If frames_per_chunk > 0, we only retain up to the following number of
// frames and discard older frames.
int max_frames = num_chunks * frames_per_chunk;
while (num_buffered_frames > max_frames) {
torch::Tensor& t = chunks.front();
num_buffered_frames -= t.size(0);
chunks.pop_front();
}
}
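
To trace the slice-and-trim bookkeeping above in isolation, here is a minimal standalone sketch (not part of this change) that uses plain integers in place of per-tensor frame counts; the values frames_per_chunk = 3, num_chunks = 2, and the 8-frame push are arbitrary illustrative choices:

#include <algorithm>
#include <deque>
#include <iostream>

int main() {
  const int frames_per_chunk = 3;
  const int num_chunks = 2;
  std::deque<int> chunks; // each entry: frame count of one buffered tensor
  int num_buffered_frames = 0;

  // Push: slice an incoming 8-frame tensor into frames_per_chunk pieces.
  int incoming = 8;
  while (incoming > 0) {
    int n = std::min(incoming, frames_per_chunk);
    chunks.push_back(n);
    num_buffered_frames += n;
    incoming -= n;
  }
  // chunks is now {3, 3, 2} and num_buffered_frames == 8.

  // Trim: retain at most num_chunks * frames_per_chunk (= 6) frames.
  while (num_buffered_frames > num_chunks * frames_per_chunk) {
    num_buffered_frames -= chunks.front();
    chunks.pop_front();
  }
  // The oldest chunk (3 frames) was dropped; this prints "3 2".
  for (int n : chunks)
    std::cout << n << ' ';
  std::cout << '\n';
}
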
void AudioBuffer::push_frame(AVFrame* frame) {
push_tensor(convert_audio_tensor(frame));
}
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Video
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_image_tensor(AVFrame* pFrame) {
// ref:
@@ -130,34 +203,79 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) {
}
} // namespace
void Buffer::push_video_frame(AVFrame* pFrame) {
chunks.push_back(convert_image_tensor(pFrame));
void VideoBuffer::push_tensor(torch::Tensor t) {
// The incoming tensor is expected to contain only one video frame.
chunks.push_back(t);
num_buffered_frames += t.size(0);
if (frames_per_chunk < 0) {
return;
}
// Trim
int max_frames = num_chunks * frames_per_chunk;
if (num_buffered_frames > max_frames) {
torch::Tensor& t = chunks.front();
num_buffered_frames -= t.size(0);
chunks.pop_front();
}
}
torch::Tensor Buffer::pop_all() {
if (!chunks.size())
return torch::empty({});
void VideoBuffer::push_frame(AVFrame* frame) {
push_tensor(convert_image_tensor(frame));
}
std::vector<torch::Tensor> tmp;
while (chunks.size()) {
tmp.push_back(chunks.front());
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Pop
////////////////////////////////////////////////////////////////////////////////
using namespace torch::indexing;
c10::optional<torch::Tensor> Buffer::pop_chunk() {
if (!num_buffered_frames) {
return c10::optional<torch::Tensor>{};
}
if (frames_per_chunk < 0) {
return c10::optional<torch::Tensor>{pop_all()};
}
return c10::optional<torch::Tensor>{pop_one_chunk()};
}
torch::Tensor AudioBuffer::pop_one_chunk() {
// Chunks in the audio deque are aligned to `frames_per_chunk`
torch::Tensor ret = chunks.front();
chunks.pop_front();
num_buffered_frames -= ret.size(0);
return ret;
}
torch::Tensor VideoBuffer::pop_one_chunk() {
// The video deque contains one frame per tensor
std::vector<torch::Tensor> ret;
while (num_buffered_frames > 0 && ret.size() < frames_per_chunk) {
torch::Tensor& t = chunks.front();
ret.push_back(t);
chunks.pop_front();
num_buffered_frames -= 1;
}
return torch::cat(tmp, 0);
return torch::cat(ret, 0);
}
void Buffer::push_frame(AVFrame* frame) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
push_audio_frame(frame);
break;
case AVMEDIA_TYPE_VIDEO:
push_video_frame(frame);
break;
default:
throw std::runtime_error(
"Unexpected media type. Only audio/video is supported.");
torch::Tensor Buffer::pop_all() {
// Note:
// This method is common to audio/video.
// In the audio case, each Tensor contains multiple frames;
// in the video case, each Tensor contains one frame.
std::vector<torch::Tensor> ret;
while (chunks.size()) {
torch::Tensor& t = chunks.front();
int n_frames = t.size(0);
ret.push_back(t);
chunks.pop_front();
num_buffered_frames -= n_frames;
}
return torch::cat(ret, 0);
}
} // namespace ffmpeg
} // namespace torchaudio
@@ -7,18 +7,82 @@ namespace torchaudio {
namespace ffmpeg {
class Buffer {
protected:
// Each AVFrame is converted to a Tensor and stored here.
std::deque<torch::Tensor> chunks;
AVMediaType media_type;
void push_audio_frame(AVFrame* pFrame);
void push_video_frame(AVFrame* pFrame);
// The number of frames to return as a chunk
// If < 0, the user wants to receive all the frames
const int frames_per_chunk;
// The number of chunks to retain
const int num_chunks;
// The number of currently buffered frames.
// For video, one Tensor corresponds to one frame, but for audio,
// one Tensor contains multiple samples, so we track the count here.
int num_buffered_frames = 0;
public:
Buffer(AVMediaType type);
Buffer(int frames_per_chunk, int num_chunks);
virtual ~Buffer() = default;
void push_frame(AVFrame* pFrame);
//////////////////////////////////////////////////////////////////////////////
// Query
//////////////////////////////////////////////////////////////////////////////
// Check if the buffer has enough frames for a chunk.
// If frames_per_chunk < 0, returns true if there are any buffered frames.
// Otherwise, returns whether num_buffered_frames >= frames_per_chunk.
bool is_ready() const;
//////////////////////////////////////////////////////////////////////////////
// Modifiers
//////////////////////////////////////////////////////////////////////////////
virtual void push_frame(AVFrame* frame) = 0;
c10::optional<torch::Tensor> pop_chunk();
private:
virtual torch::Tensor pop_one_chunk() = 0;
torch::Tensor pop_all();
};
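
To illustrate how the public surface (push_frame, is_ready, pop_chunk) is meant to compose, here is a hypothetical consumer loop; the helper name drain_frames, the callback, the omitted error handling, and the header path are assumptions, not part of this change:

#include <torchaudio/csrc/ffmpeg/buffer.h> // assumed path of this header
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/frame.h>
}

// Drain all decoded frames from a codec context into a buffer and hand each
// complete chunk to a callback (sketch only; decode errors are not handled).
template <typename Callback>
void drain_frames(
    AVCodecContext* codec_ctx,
    torchaudio::ffmpeg::Buffer& buffer,
    Callback&& on_chunk) {
  AVFrame* frame = av_frame_alloc();
  while (avcodec_receive_frame(codec_ctx, frame) >= 0) {
    buffer.push_frame(frame); // frame data is converted to a Tensor (copied)
    av_frame_unref(frame);
    while (buffer.is_ready()) {
      if (auto chunk = buffer.pop_chunk())
        on_chunk(*chunk);
    }
  }
  av_frame_free(&frame);
}
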
// Specialization of the handling around push/pop for audio/video.
////////////////////////////////////////////////////////////////////////////////
// AudioBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For audio, an input AVFrame contains multiple frames.
// When popping the buffered frames chunk-by-chunk, it is easier if they are
// organized by chunk when pushed to the deque.
// Therefore, the audio buffer implements a pushing mechanism that ensures
// each Tensor in the deque contains `frames_per_chunk` frames.
class AudioBuffer : public Buffer {
public:
AudioBuffer(int frames_per_chunk, int num_chunks);
void push_frame(AVFrame* frame);
private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};
////////////////////////////////////////////////////////////////////////////////
// VideoBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For video, an input AVFrame contains one frame.
// Contrary to audio, it is simple to push one frame at a time to the deque.
// But this means that chunks consisting of multiple frames have to be
// assembled at popping time.
class VideoBuffer : public Buffer {
public:
VideoBuffer(int frames_per_chunk, int num_chunks);
void push_frame(AVFrame* frame);
private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};
} // namespace ffmpeg
} // namespace torchaudio
@@ -14,12 +14,20 @@ namespace {
AVFormatContext* get_format_context(
const std::string& src,
const std::string& device,
AVDictionary** option) {
const std::map<std::string, std::string>& option) {
AVFormatContext* pFormat = NULL;
AVInputFormat* pInput =
device.empty() ? NULL : av_find_input_format(device.c_str());
if (avformat_open_input(&pFormat, src.c_str(), pInput, option) < 0)
AVDictionary* dict = NULL;
for (auto& it : option) {
av_dict_set(&dict, it.first.c_str(), it.second.c_str(), 0);
}
int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &dict);
av_dict_free(&dict);
if (ret < 0)
throw std::runtime_error("Failed to open the input: " + src);
return pFormat;
}
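
A hypothetical call site for the new map-based option interface; the device ("v4l2"), the path, and the option keys ("framerate", "video_size") are standard FFmpeg examples rather than values taken from this change:

#include <map>
#include <string>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>

// Open a V4L2 capture device, forwarding demuxer options through the new
// map-based interface (illustrative sketch only).
void stream_from_webcam() {
  const std::map<std::string, std::string> option{
      {"framerate", "30"}, {"video_size", "640x480"}};
  torchaudio::ffmpeg::AVFormatContextPtr format_ctx(
      "/dev/video0", "v4l2", option);
  // format_ctx now wraps an opened AVFormatContext with stream info found.
}
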
@@ -28,7 +36,7 @@ AVFormatContext* get_format_context(
AVFormatContextPtr::AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option)
const std::map<std::string, std::string>& option)
: Wrapper<AVFormatContext, AVFormatContextDeleter>(
get_format_context(src, device, option)) {
if (avformat_find_stream_info(ptr.get(), NULL) < 0)
@@ -82,17 +90,6 @@ AVFrame* get_av_frame() {
AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}
///////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p){};
AutoFrameUnref::~AutoFrameUnref() {
av_frame_unref(p_);
}
AutoFrameUnref::operator AVFrame*() const {
return p_;
}
////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
......
// One-stop header for all ffmpeg needs
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>
@@ -58,7 +59,7 @@ struct AVFormatContextPtr
AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option);
const std::map<std::string, std::string>& option);
};
////////////////////////////////////////////////////////////////////////////////
@@ -101,18 +102,6 @@ struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
AVFramePtr();
};
////////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
// Similar to `AutoPacketUnref`, this structure will release the memory
// allocated for frame content.
struct AutoFrameUnref {
AVFramePtr& p_;
AutoFrameUnref(AVFramePtr& p);
~AutoFrameUnref();
operator AVFrame*() const;
};
////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
......
@@ -15,6 +15,13 @@ FilterGraph::FilterGraph(
create_filter();
}
////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
std::string FilterGraph::get_description() const {
return filter_description;
};
////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
......
@@ -11,10 +11,9 @@ class FilterGraph {
// so we do not manage the resource.
AVFilterContext* buffersrc_ctx = nullptr;
AVFilterContext* buffersink_ctx = nullptr;
public:
const std::string filter_description;
public:
FilterGraph(
AVRational time_base,
AVCodecParameters* codecpar,
@@ -28,6 +27,11 @@
FilterGraph(FilterGraph&&) = default;
FilterGraph& operator=(FilterGraph&&) = default;
//////////////////////////////////////////////////////////////////////////////
// Query method
//////////////////////////////////////////////////////////////////////////////
std::string get_description() const;
//////////////////////////////////////////////////////////////////////////////
// Configuration methods
//////////////////////////////////////////////////////////////////////////////
......
@@ -98,6 +98,10 @@ int64_t find_best_video_stream(S s) {
return s->s.find_best_video_stream();
}
void seek(S s, int64_t timestamp) {
s->s.seek(timestamp);
}
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
char buffer[512];
@@ -309,6 +313,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
m.def(
"torchaudio::ffmpeg_streamer_find_best_video_stream",
find_best_video_stream);
m.def("torchaudio::ffmpeg_streamer_seek", seek);
m.def(
"torchaudio::ffmpeg_streamer_add_basic_audio_stream",
add_basic_audio_stream);
......
@@ -133,6 +133,14 @@ bool Streamer::is_buffer_ready() const {
////////////////////////////////////////////////////////////////////////////////
// Configure methods
////////////////////////////////////////////////////////////////////////////////
void Streamer::seek(double timestamp) {
int64_t ts = static_cast<int64_t>(timestamp * AV_TIME_BASE);
int ret = avformat_seek_file(pFormatContext, -1, INT64_MIN, ts, INT64_MAX, 0);
if (ret < 0) {
throw std::runtime_error(std::string("Failed to seek: ") + av_err2str(ret));
}
}
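
For reference, a worked example of the conversion above (hypothetical usage; `streamer` stands for an already-constructed Streamer):

// AV_TIME_BASE is 1000000, so seeking to 3.5 seconds requests timestamp
// 3500000 from avformat_seek_file on the default stream (index -1);
// INT64_MIN / INT64_MAX leave the surrounding search window unconstrained.
streamer.seek(3.5);
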
void Streamer::add_audio_stream(
int i,
int frames_per_chunk,
......
@@ -60,6 +60,8 @@ class Streamer {
//////////////////////////////////////////////////////////////////////////////
// Configure methods
//////////////////////////////////////////////////////////////////////////////
void seek(double timestamp);
void add_audio_stream(
int i,
int frames_per_chunk,
......
#pragma once
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <iostream>
namespace torchaudio {
namespace ffmpeg {
struct SrcStreamInfo {
AVMediaType media_type;
const char* codec_name = NULL;
const char* codec_long_name = NULL;
const char* fmt_name = NULL;
int bit_rate = 0;
// Audio
double sample_rate = 0;
int num_channels = 0;
// Video
int width = 0;
int height = 0;
double frame_rate = 0;
};
struct OutputStreamInfo {
int source_index;
std::string filter_description;
double rate;
OutputStreamInfo() = default;
};
} // namespace ffmpeg
} // namespace torchaudio
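
A sketch of how SrcStreamInfo might be populated from a demuxed stream; the helper read_stream_info and the include path are hypothetical, while the FFmpeg fields it reads (codecpar, avg_frame_rate, the codec descriptor) are standard:

#include <torchaudio/csrc/ffmpeg/typedefs.h> // assumed path of this header
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/rational.h>
}

// Hypothetical helper: fill SrcStreamInfo from a demuxed stream.
torchaudio::ffmpeg::SrcStreamInfo read_stream_info(const AVStream* stream) {
  torchaudio::ffmpeg::SrcStreamInfo info;
  const AVCodecParameters* par = stream->codecpar;
  info.media_type = par->codec_type;
  info.bit_rate = static_cast<int>(par->bit_rate);
  if (const AVCodecDescriptor* desc = avcodec_descriptor_get(par->codec_id)) {
    info.codec_name = desc->name;
    info.codec_long_name = desc->long_name;
  }
  if (par->codec_type == AVMEDIA_TYPE_AUDIO) {
    info.sample_rate = static_cast<double>(par->sample_rate);
    info.num_channels = par->channels;
  } else if (par->codec_type == AVMEDIA_TYPE_VIDEO) {
    info.width = par->width;
    info.height = par->height;
    info.frame_rate = av_q2d(stream->avg_frame_rate);
  }
  return info;
}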