Commit 9cb75e74 authored by moto, committed by Facebook GitHub Bot

Update and fill the rest of ffmpeg-integration C++ code (#2113)

Summary:
- Introduce AudioBuffer and VideoBuffer for different ways of handling frames
- Update the way the option dictionary is passed
- Remove unused AutoFrameUnref
- Add SrcStreamInfo/OutputStreamInfo classes

Pull Request resolved: https://github.com/pytorch/audio/pull/2113

Reviewed By: nateanl

Differential Revision: D33356144

Pulled By: mthrok

fbshipit-source-id: e837e84fae48baa7befd5c70599bcd2cbb61514d
parent fd3c9573
@@ -5,8 +5,27 @@
namespace torchaudio {
namespace ffmpeg {
Buffer::Buffer(AVMediaType type) : media_type(type) {}
Buffer::Buffer(int frames_per_chunk, int num_chunks)
: frames_per_chunk(frames_per_chunk), num_chunks(num_chunks) {}
AudioBuffer::AudioBuffer(int frames_per_chunk, int num_chunks)
: Buffer(frames_per_chunk, num_chunks) {}
VideoBuffer::VideoBuffer(int frames_per_chunk, int num_chunks)
: Buffer(frames_per_chunk, num_chunks) {}
////////////////////////////////////////////////////////////////////////////////
// Query
////////////////////////////////////////////////////////////////////////////////
bool Buffer::is_ready() const {
if (frames_per_chunk < 0)
return num_buffered_frames > 0;
return num_buffered_frames >= frames_per_chunk;
}
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Audio
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
// ref: https://ffmpeg.org/doxygen/4.1/filter__audio_8c_source.html#l00215
@@ -82,10 +101,64 @@ torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
}
} // namespace
void Buffer::push_audio_frame(AVFrame* pFrame) {
chunks.push_back(convert_audio_tensor(pFrame));
void AudioBuffer::push_tensor(torch::Tensor t) {
// If frames_per_chunk < 0, users want to fetch all frames.
// Just push back to chunks and that's it.
if (frames_per_chunk < 0) {
chunks.push_back(t);
num_buffered_frames += t.size(0);
return;
}
// Push
// Note:
// For audio, the incoming tensor contains multiple frames (samples).
// For a small `frames_per_chunk` value, that can be more than `max_frames`.
// If we pushed the tensor as-is, the whole tensor could then be popped at the
// trimming stage, leaving the buffer always empty. So we slice the incoming
// Tensor and push the pieces.
// Check the last inserted Tensor; if its number of frames is not
// `frames_per_chunk`, reprocess it together with the incoming tensor.
if (num_buffered_frames % frames_per_chunk) {
torch::Tensor prev = chunks.back();
chunks.pop_back();
num_buffered_frames -= prev.size(0);
t = torch::cat({prev, t}, 0);
}
while (true) {
int num_input_frames = t.size(0);
if (num_input_frames <= frames_per_chunk) {
chunks.push_back(t);
num_buffered_frames += num_input_frames;
break;
}
// The input tensor contains more frames than frames_per_chunk
auto splits = torch::tensor_split(t, {frames_per_chunk, num_input_frames});
chunks.push_back(splits[0]);
num_buffered_frames += frames_per_chunk;
t = splits[1];
}
// Trim
// If frames_per_chunk > 0, we only retain up to the following number of
// frames and discard older frames.
int max_frames = num_chunks * frames_per_chunk;
while (num_buffered_frames > max_frames) {
torch::Tensor& t = chunks.front();
num_buffered_frames -= t.size(0);
chunks.pop_front();
}
}
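
To trace the slice-and-trim bookkeeping above in isolation, here is a minimal standalone sketch (not part of this change) that uses plain integers in place of per-tensor frame counts; the values frames_per_chunk = 3, num_chunks = 2, and the 8-frame push are arbitrary illustrative choices:

#include <algorithm>
#include <deque>
#include <iostream>

int main() {
  const int frames_per_chunk = 3;
  const int num_chunks = 2;
  std::deque<int> chunks; // each entry: frame count of one buffered tensor
  int num_buffered_frames = 0;

  // Push: slice an incoming 8-frame tensor into frames_per_chunk pieces.
  int incoming = 8;
  while (incoming > 0) {
    int n = std::min(incoming, frames_per_chunk);
    chunks.push_back(n);
    num_buffered_frames += n;
    incoming -= n;
  }
  // chunks is now {3, 3, 2} and num_buffered_frames == 8.

  // Trim: retain at most num_chunks * frames_per_chunk (= 6) frames.
  while (num_buffered_frames > num_chunks * frames_per_chunk) {
    num_buffered_frames -= chunks.front();
    chunks.pop_front();
  }
  // The oldest chunk (3 frames) was dropped; this prints "3 2".
  for (int n : chunks)
    std::cout << n << ' ';
  std::cout << '\n';
}
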
void AudioBuffer::push_frame(AVFrame* frame) {
push_tensor(convert_audio_tensor(frame));
}
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Video
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_image_tensor(AVFrame* pFrame) {
// ref:
@@ -130,34 +203,79 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) {
}
} // namespace
void Buffer::push_video_frame(AVFrame* pFrame) {
chunks.push_back(convert_image_tensor(pFrame));
void VideoBuffer::push_tensor(torch::Tensor t) {
// The incoming tensor is expected to contain only one video frame.
chunks.push_back(t);
num_buffered_frames += t.size(0);
if (frames_per_chunk < 0) {
return;
}
// Trim
int max_frames = num_chunks * frames_per_chunk;
if (num_buffered_frames > max_frames) {
torch::Tensor& t = chunks.front();
num_buffered_frames -= t.size(0);
chunks.pop_front();
}
}
torch::Tensor Buffer::pop_all() {
if (!chunks.size())
return torch::empty({});
void VideoBuffer::push_frame(AVFrame* frame) {
push_tensor(convert_image_tensor(frame));
}
std::vector<torch::Tensor> tmp;
while (chunks.size()) {
tmp.push_back(chunks.front());
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Pop
////////////////////////////////////////////////////////////////////////////////
using namespace torch::indexing;
c10::optional<torch::Tensor> Buffer::pop_chunk() {
if (!num_buffered_frames) {
return c10::optional<torch::Tensor>{};
}
if (frames_per_chunk < 0) {
return c10::optional<torch::Tensor>{pop_all()};
}
return c10::optional<torch::Tensor>{pop_one_chunk()};
}
torch::Tensor AudioBuffer::pop_one_chunk() {
// Chunks in the audio deque are aligned to `frames_per_chunk`
torch::Tensor ret = chunks.front();
chunks.pop_front();
num_buffered_frames -= ret.size(0);
return ret;
}
torch::Tensor VideoBuffer::pop_one_chunk() {
// The video deque contains one frame per tensor
std::vector<torch::Tensor> ret;
while (num_buffered_frames > 0 && ret.size() < frames_per_chunk) {
torch::Tensor& t = chunks.front();
ret.push_back(t);
chunks.pop_front();
num_buffered_frames -= 1;
}
return torch::cat(tmp, 0);
return torch::cat(ret, 0);
}
void Buffer::push_frame(AVFrame* frame) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
push_audio_frame(frame);
break;
case AVMEDIA_TYPE_VIDEO:
push_video_frame(frame);
break;
default:
throw std::runtime_error(
"Unexpected media type. Only audio/video is supported.");
torch::Tensor Buffer::pop_all() {
// Note:
// This method is common to audio/video.
// In the audio case, each Tensor contains multiple frames;
// in the video case, each Tensor contains one frame.
std::vector<torch::Tensor> ret;
while (chunks.size()) {
torch::Tensor& t = chunks.front();
int n_frames = t.size(0);
ret.push_back(t);
chunks.pop_front();
num_buffered_frames -= n_frames;
}
return torch::cat(ret, 0);
}
} // namespace ffmpeg
} // namespace torchaudio
@@ -7,18 +7,82 @@ namespace torchaudio {
namespace ffmpeg {
class Buffer {
protected:
// Each AVFrame is converted to a Tensor and stored here.
std::deque<torch::Tensor> chunks;
AVMediaType media_type;
void push_audio_frame(AVFrame* pFrame);
void push_video_frame(AVFrame* pFrame);
// The number of frames to return as a chunk
// If < 0, the user wants to receive all the frames
const int frames_per_chunk;
// The number of chunks to retain
const int num_chunks;
// The number of currently buffered frames.
// For video, one Tensor corresponds to one frame, but for audio,
// one Tensor contains multiple samples, so we track the count here.
int num_buffered_frames = 0;
public:
Buffer(AVMediaType type);
Buffer(int frames_per_chunk, int num_chunks);
virtual ~Buffer() = default;
void push_frame(AVFrame* pFrame);
//////////////////////////////////////////////////////////////////////////////
// Query
//////////////////////////////////////////////////////////////////////////////
// Check if the buffer has enough frames for a chunk.
// If frames_per_chunk < 0, returns true if there are any buffered frames.
// Otherwise, returns whether num_buffered_frames >= frames_per_chunk.
bool is_ready() const;
//////////////////////////////////////////////////////////////////////////////
// Modifiers
//////////////////////////////////////////////////////////////////////////////
virtual void push_frame(AVFrame* frame) = 0;
c10::optional<torch::Tensor> pop_chunk();
private:
virtual torch::Tensor pop_one_chunk() = 0;
torch::Tensor pop_all();
};
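
To illustrate how the public surface (push_frame, is_ready, pop_chunk) is meant to compose, here is a hypothetical consumer loop; the helper name drain_frames, the callback, the omitted error handling, and the header path are assumptions, not part of this change:

#include <torchaudio/csrc/ffmpeg/buffer.h> // assumed path of this header
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/frame.h>
}

// Drain all decoded frames from a codec context into a buffer and hand each
// complete chunk to a callback (sketch only; decode errors are not handled).
template <typename Callback>
void drain_frames(
    AVCodecContext* codec_ctx,
    torchaudio::ffmpeg::Buffer& buffer,
    Callback&& on_chunk) {
  AVFrame* frame = av_frame_alloc();
  while (avcodec_receive_frame(codec_ctx, frame) >= 0) {
    buffer.push_frame(frame); // frame data is converted to a Tensor (copied)
    av_frame_unref(frame);
    while (buffer.is_ready()) {
      if (auto chunk = buffer.pop_chunk())
        on_chunk(*chunk);
    }
  }
  av_frame_free(&frame);
}
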
// Specialization of the handling around push/pop for audio/video.
////////////////////////////////////////////////////////////////////////////////
// AudioBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For audio, an input AVFrame contains multiple frames.
// When popping the buffered frames chunk-by-chunk, it is easier if they are
// organized by chunk when pushed to the deque.
// Therefore, the audio buffer implements a pushing mechanism that ensures
// each Tensor in the deque contains `frames_per_chunk` frames.
class AudioBuffer : public Buffer {
public:
AudioBuffer(int frames_per_chunk, int num_chunks);
void push_frame(AVFrame* frame);
private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};
////////////////////////////////////////////////////////////////////////////////
// VideoBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For video, an input AVFrame contains one frame.
// Contrary to audio, it is simple to push one frame at a time to the deque.
// But this means that chunks consisting of multiple frames have to be
// assembled at popping time.
class VideoBuffer : public Buffer {
public:
VideoBuffer(int frames_per_chunk, int num_chunks);
void push_frame(AVFrame* frame);
private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};
} // namespace ffmpeg
} // namespace torchaudio
@@ -14,12 +14,20 @@ namespace {
AVFormatContext* get_format_context(
const std::string& src,
const std::string& device,
AVDictionary** option) {
const std::map<std::string, std::string>& option) {
AVFormatContext* pFormat = NULL;
AVInputFormat* pInput =
device.empty() ? NULL : av_find_input_format(device.c_str());
if (avformat_open_input(&pFormat, src.c_str(), pInput, option) < 0)
AVDictionary* dict = NULL;
for (auto& it : option) {
av_dict_set(&dict, it.first.c_str(), it.second.c_str(), 0);
}
int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &dict);
av_dict_free(&dict);
if (ret < 0)
throw std::runtime_error("Failed to open the input: " + src);
return pFormat;
}
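
A hypothetical call site for the new map-based option interface; the device ("v4l2"), the path, and the option keys ("framerate", "video_size") are standard FFmpeg examples rather than values taken from this change:

#include <map>
#include <string>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>

// Open a V4L2 capture device, forwarding demuxer options through the new
// map-based interface (illustrative sketch only).
void stream_from_webcam() {
  const std::map<std::string, std::string> option{
      {"framerate", "30"}, {"video_size", "640x480"}};
  torchaudio::ffmpeg::AVFormatContextPtr format_ctx(
      "/dev/video0", "v4l2", option);
  // format_ctx now wraps an opened AVFormatContext with stream info found.
}
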
@@ -28,7 +36,7 @@ AVFormatContext* get_format_context(
AVFormatContextPtr::AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option)
const std::map<std::string, std::string>& option)
: Wrapper<AVFormatContext, AVFormatContextDeleter>(
get_format_context(src, device, option)) {
if (avformat_find_stream_info(ptr.get(), NULL) < 0)
@@ -82,17 +90,6 @@ AVFrame* get_av_frame() {
AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}
///////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p){};
AutoFrameUnref::~AutoFrameUnref() {
av_frame_unref(p_);
}
AutoFrameUnref::operator AVFrame*() const {
return p_;
}
////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
......
// One-stop header for all ffmpeg needs
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>
@@ -58,7 +59,7 @@ struct AVFormatContextPtr
AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option);
const std::map<std::string, std::string>& option);
};
////////////////////////////////////////////////////////////////////////////////
@@ -101,18 +102,6 @@ struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
AVFramePtr();
};
////////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
// Similar to `AutoPacketUnref`, this structure will release the memory
// allocated for frame content.
struct AutoFrameUnref {
AVFramePtr& p_;
AutoFrameUnref(AVFramePtr& p);
~AutoFrameUnref();
operator AVFrame*() const;
};
////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
......
@@ -15,6 +15,13 @@ FilterGraph::FilterGraph(
create_filter();
}
////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
std::string FilterGraph::get_description() const {
return filter_description;
};
////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
......
@@ -11,10 +11,9 @@ class FilterGraph {
// so we do not manage the resource.
AVFilterContext* buffersrc_ctx = nullptr;
AVFilterContext* buffersink_ctx = nullptr;
public:
const std::string filter_description;
public:
FilterGraph(
AVRational time_base,
AVCodecParameters* codecpar,
@@ -28,6 +27,11 @@
FilterGraph(FilterGraph&&) = default;
FilterGraph& operator=(FilterGraph&&) = default;
//////////////////////////////////////////////////////////////////////////////
// Query method
//////////////////////////////////////////////////////////////////////////////
std::string get_description() const;
//////////////////////////////////////////////////////////////////////////////
// Configuration methods
//////////////////////////////////////////////////////////////////////////////
......
@@ -98,6 +98,10 @@ int64_t find_best_video_stream(S s) {
return s->s.find_best_video_stream();
}
void seek(S s, int64_t timestamp) {
s->s.seek(timestamp);
}
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
char buffer[512];
@@ -309,6 +313,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
m.def(
"torchaudio::ffmpeg_streamer_find_best_video_stream",
find_best_video_stream);
m.def("torchaudio::ffmpeg_streamer_seek", seek);
m.def(
"torchaudio::ffmpeg_streamer_add_basic_audio_stream",
add_basic_audio_stream);
......
@@ -133,6 +133,14 @@ bool Streamer::is_buffer_ready() const {
////////////////////////////////////////////////////////////////////////////////
// Configure methods
////////////////////////////////////////////////////////////////////////////////
void Streamer::seek(double timestamp) {
int64_t ts = static_cast<int64_t>(timestamp * AV_TIME_BASE);
int ret = avformat_seek_file(pFormatContext, -1, INT64_MIN, ts, INT64_MAX, 0);
if (ret < 0) {
throw std::runtime_error(std::string("Failed to seek: ") + av_err2str(ret));
}
}
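
For reference, a worked example of the conversion above (hypothetical usage; `streamer` stands for an already-constructed Streamer):

// AV_TIME_BASE is 1000000, so seeking to 3.5 seconds requests timestamp
// 3500000 from avformat_seek_file on the default stream (index -1);
// INT64_MIN / INT64_MAX leave the surrounding search window unconstrained.
streamer.seek(3.5);
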
void Streamer::add_audio_stream(
int i,
int frames_per_chunk,
......
@@ -60,6 +60,8 @@ class Streamer {
//////////////////////////////////////////////////////////////////////////////
// Configure methods
//////////////////////////////////////////////////////////////////////////////
void seek(double timestamp);
void add_audio_stream(
int i,
int frames_per_chunk,
......
#pragma once
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
#include <iostream>
namespace torchaudio {
namespace ffmpeg {
struct SrcStreamInfo {
AVMediaType media_type;
const char* codec_name = NULL;
const char* codec_long_name = NULL;
const char* fmt_name = NULL;
int bit_rate = 0;
// Audio
double sample_rate = 0;
int num_channels = 0;
// Video
int width = 0;
int height = 0;
double frame_rate = 0;
};
struct OutputStreamInfo {
int source_index;
std::string filter_description;
double rate;
OutputStreamInfo() = default;
};
} // namespace ffmpeg
} // namespace torchaudio
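
A sketch of how SrcStreamInfo might be populated from a demuxed stream; the helper read_stream_info and the include path are hypothetical, while the FFmpeg fields it reads (codecpar, avg_frame_rate, the codec descriptor) are standard:

#include <torchaudio/csrc/ffmpeg/typedefs.h> // assumed path of this header
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/rational.h>
}

// Hypothetical helper: fill SrcStreamInfo from a demuxed stream.
torchaudio::ffmpeg::SrcStreamInfo read_stream_info(const AVStream* stream) {
  torchaudio::ffmpeg::SrcStreamInfo info;
  const AVCodecParameters* par = stream->codecpar;
  info.media_type = par->codec_type;
  info.bit_rate = static_cast<int>(par->bit_rate);
  if (const AVCodecDescriptor* desc = avcodec_descriptor_get(par->codec_id)) {
    info.codec_name = desc->name;
    info.codec_long_name = desc->long_name;
  }
  if (par->codec_type == AVMEDIA_TYPE_AUDIO) {
    info.sample_rate = static_cast<double>(par->sample_rate);
    info.num_channels = par->channels;
  } else if (par->codec_type == AVMEDIA_TYPE_VIDEO) {
    info.width = par->width;
    info.height = par->height;
    info.frame_rate = av_q2d(stream->avg_frame_rate);
  }
  return info;
}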