Commit 0bf00d20 authored by Moto Hira, committed by Facebook GitHub Bot

Extract image conversions into separate class (#3120)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3120

This commit extracts the image conversion ops into the ImageTensorConverter class, and makes it independent from the OutputStream class.

The ImageTensorConverter class implements a range-based for-loop interface, like

```
for (auto const& frame : ImageTensorConverter::convert(...)) {
    post_process_with_avframe(frame);
}
```

This decouples the encoder from the image conversion.
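Concretely, `VideoOutputStream::write_chunk` in this diff becomes:

```
for (const auto& frame : converter.convert(frames)) {
  frame->pts = num_frames;
  num_frames += 1;
  process_frame(frame);
}
```

where `converter` is a `VideoTensorConverter` member of the output stream.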

Reviewed By: nateanl

Differential Revision: D43666296

fbshipit-source-id: 754efe677bc7695b3f138a6d076be2106e186b79
parent c9c8c7e1
@@ -17,8 +17,10 @@ set(
  stream_reader/stream_processor.cpp
  stream_reader/stream_reader.cpp
  stream_writer/encoder.cpp
  stream_writer/converter.cpp
  stream_writer/output_stream.cpp
  stream_writer/audio_output_stream.cpp
  stream_writer/video_converter.cpp
  stream_writer/video_output_stream.cpp
  stream_writer/stream_writer.cpp
  compat.cpp
#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

namespace torchaudio::io {

using Iterator = Generator::Iterator;
using ConvertFunc = Generator::ConvertFunc;

////////////////////////////////////////////////////////////////////////////////
// Generator
////////////////////////////////////////////////////////////////////////////////

Generator::Generator(torch::Tensor frames_, AVFrame* buff, ConvertFunc& func)
    : frames(std::move(frames_)), buffer(buff), convert_func(func) {}

Iterator Generator::begin() const {
  return Iterator{frames, buffer, convert_func};
}

int64_t Generator::end() const {
  return frames.size(0);
}

////////////////////////////////////////////////////////////////////////////////
// Iterator
////////////////////////////////////////////////////////////////////////////////

Iterator::Iterator(
    const torch::Tensor frames_,
    AVFrame* buffer_,
    ConvertFunc& convert_func_)
    : frames(frames_), buffer(buffer_), convert_func(convert_func_) {}

Iterator& Iterator::operator++() {
  ++i;
  return *this;
}

AVFrame* Iterator::operator*() const {
  convert_func(frames.index({i}), buffer);
  return buffer;
}

bool Iterator::operator!=(const int64_t other) const {
  return i != other;
}

} // namespace torchaudio::io
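For reference, this iteration relies on C++17's heterogeneous range-based for loop: `begin()` returns an `Iterator` while `end()` returns a plain `int64_t` sentinel, and the loop compares them through `operator!=(int64_t)`. A minimal self-contained sketch of the same pattern (illustrative only, not part of this commit):

```
#include <cstdint>
#include <iostream>

struct Squares {
  int64_t n; // number of values to generate

  struct Iterator {
    int64_t i = 0;
    Iterator& operator++() { ++i; return *this; }
    // The "conversion" happens on dereference, as in Generator::Iterator.
    int64_t operator*() const { return i * i; }
    bool operator!=(int64_t end) const { return i != end; }
  };

  Iterator begin() const { return {}; }
  int64_t end() const { return n; } // sentinel: just a count, like frames.size(0)
};

int main() {
  for (auto v : Squares{5}) { // prints: 0 1 4 9 16
    std::cout << v << ' ';
  }
}
```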
#pragma once

#include <torch/types.h>
#include <torchaudio/csrc/ffmpeg/ffmpeg.h>

namespace torchaudio::io {

//////////////////////////////////////////////////////////////////////////////
// Generator
//////////////////////////////////////////////////////////////////////////////
// Generator class is responsible for implementing an interface compatible
// with the range-based for loop (begin and end), and for the initialization
// of frame data (channel reordering and ensuring contiguous-ness).
class Generator {
 public:
  // A convert function writes an input frame Tensor to a destination AVFrame.
  // Both the input tensor and the AVFrame are expected to be valid and
  // properly allocated (i.e. it is a glorified copy).
  // It is a one-to-one conversion, performed in Iterator.
  using ConvertFunc = std::function<void(const torch::Tensor&, AVFrame*)>;

  //////////////////////////////////////////////////////////////////////////
  // Iterator
  //////////////////////////////////////////////////////////////////////////
  // Iterator class is responsible for implementing the iterator protocol,
  // that is, increment, comparison, and dereference (which applies the
  // conversion function).
  class Iterator {
    // Input tensor; has to be NCHW or NHWC, uint8, CPU or CUDA.
    // It is sliced at dereference time.
    const torch::Tensor frames;
    // Output buffer (not owned, but modified by Iterator)
    AVFrame* buffer;
    // Function that converts one frame Tensor into AVFrame.
    ConvertFunc& convert_func;
    // Index
    int64_t i = 0;

   public:
    Iterator(
        const torch::Tensor tensor,
        AVFrame* buffer,
        ConvertFunc& convert_func);
    Iterator& operator++();
    AVFrame* operator*() const;
    bool operator!=(const int64_t other) const;
  };

 private:
  // Tensor representing video frames provided by client code.
  // Expected (and validated) to be NCHW, uint8.
  torch::Tensor frames;
  // Output buffer (not owned, passed to iterator)
  AVFrame* buffer;
  // ops: not owned.
  ConvertFunc& convert_func;

 public:
  Generator(torch::Tensor frames, AVFrame* buffer, ConvertFunc& convert_func);
  [[nodiscard]] Iterator begin() const;
  [[nodiscard]] int64_t end() const;
};

} // namespace torchaudio::io
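A minimal sketch of wiring a custom `ConvertFunc` into `Generator` (assumes the torchaudio source tree and FFmpeg development headers are on the include path; the no-op `copy_stub` is a hypothetical stand-in, not part of this commit):

```
#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

using torchaudio::io::Generator;

int main() {
  torch::Tensor frames = torch::zeros({4, 3, 2, 2}, torch::kUInt8); // NCHW
  AVFrame* buffer = av_frame_alloc();
  // A real ConvertFunc copies pixel data into buffer->data; this stub only
  // demonstrates the wiring.
  Generator::ConvertFunc copy_stub = [](const torch::Tensor&, AVFrame*) {};
  for (const auto& frame : Generator{frames, buffer, copy_stub}) {
    // `frame` is the same AVFrame* on every iteration; the conversion
    // function has refreshed its contents for the current slice.
    (void)frame;
  }
  av_frame_free(&buffer);
}
```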
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

#ifdef USE_CUDA
#include <c10/cuda/CUDAStream.h>
#endif

namespace torchaudio::io {

////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter
////////////////////////////////////////////////////////////////////////////////

using InitFunc = VideoTensorConverter::InitFunc;
using ConvertFunc = Generator::ConvertFunc;

namespace {

// Interlaced video
// Each frame is composed of one plane, and the color components of each pixel
// are collocated.
// The memory layout is 1D linear, interpreted as follows.
//
//     |<------ linesize[0] ------>|
//      0   1 ...  W
// 0:  RGB RGB ... RGB PAD ... PAD
// 1:  RGB RGB ... RGB PAD ... PAD
// ...
// H:  RGB RGB ... RGB PAD ... PAD
void write_interlaced_video(const torch::Tensor& frame, AVFrame* buffer) {
  const auto height = frame.size(0);
  const auto width = frame.size(1);
  const auto num_channels = frame.size(2);
  // Source rows are tightly packed; destination rows may be padded, so copy
  // row by row, advancing the destination by linesize[0].
  size_t stride = width * num_channels;
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  uint8_t* src = frame.data_ptr<uint8_t>();
  uint8_t* dst = buffer->data[0];
  for (int h = 0; h < height; ++h) {
    std::memcpy(dst, src, stride);
    src += width * num_channels;
    dst += buffer->linesize[0];
  }
}
// Planar video
// Each frame is composed of multiple planes.
// One plane can contain one or more color components.
// (At the moment, only formats without subsampled color components are
// accepted.)
//
// The memory layout is interpreted as follows.
//
//      |<------ linesize[0] ------>|
//       0  1 ...  W1
// 0:    Y  Y ...  Y  PAD ... PAD
// 1:    Y  Y ...  Y  PAD ... PAD
// ...
// H1:   Y  Y ...  Y  PAD ... PAD
//
//      |<--- linesize[1] --->|
//       0  ...  W2
// 0:   UV ... UV PAD ... PAD
// 1:   UV ... UV PAD ... PAD
// ...
// H2:  UV ... UV PAD ... PAD
//
void write_planar_video(
    const torch::Tensor& frame,
    AVFrame* buffer,
    int num_planes) {
  const auto height = frame.size(1);
  const auto width = frame.size(2);
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  for (int j = 0; j < num_planes; ++j) {
    uint8_t* src = frame.index({j}).data_ptr<uint8_t>();
    uint8_t* dst = buffer->data[j];
    for (int h = 0; h < height; ++h) {
      std::memcpy(dst, src, width);
      src += width;
      dst += buffer->linesize[j];
    }
  }
}
void write_interlaced_video_cuda(
    const torch::Tensor& frame,
    AVFrame* buffer,
    bool pad_extra) {
#ifndef USE_CUDA
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#else
  const auto height = frame.size(0);
  const auto width = frame.size(1);
  const auto num_channels = frame.size(2) + (pad_extra ? 1 : 0);
  // cudaMemcpy2D copies `height` rows of `spitch` bytes each: the source
  // tensor rows are tightly packed (pitch == row size), while the destination
  // plane may be padded (pitch == linesize[0]).
  size_t spitch = width * num_channels;
  if (cudaSuccess !=
      cudaMemcpy2D(
          (void*)(buffer->data[0]),
          buffer->linesize[0],
          (const void*)(frame.data_ptr<uint8_t>()),
          spitch,
          spitch,
          height,
          cudaMemcpyDeviceToDevice)) {
    TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
  }
#endif
}

void write_planar_video_cuda(
    const torch::Tensor& frame,
    AVFrame* buffer,
    int num_planes) {
#ifndef USE_CUDA
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#else
  const auto height = frame.size(1);
  const auto width = frame.size(2);
  // One pitched copy per plane; each plane is `height` rows of `width` bytes
  // in the source tensor.
  for (int j = 0; j < num_planes; ++j) {
    if (cudaSuccess !=
        cudaMemcpy2D(
            (void*)(buffer->data[j]),
            buffer->linesize[j],
            (const void*)(frame.index({j}).data_ptr<uint8_t>()),
            width,
            width,
            height,
            cudaMemcpyDeviceToDevice)) {
      TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
    }
  }
#endif
}
// NCHW -> NHWC, ensure contiguous
torch::Tensor init_interlaced(const torch::Tensor& tensor) {
  return tensor.permute({0, 2, 3, 1}).contiguous();
}

// Keep NCHW, ensure contiguous
torch::Tensor init_planar(const torch::Tensor& tensor) {
  return tensor.contiguous();
}
std::pair<InitFunc, ConvertFunc> get_func(
    enum AVPixelFormat pix_fmt,
    enum AVPixelFormat sw_pix_fmt) {
  using namespace std::placeholders;
  if (pix_fmt == AV_PIX_FMT_CUDA) {
    switch (sw_pix_fmt) {
      case AV_PIX_FMT_RGB0:
      case AV_PIX_FMT_BGR0: {
        ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) {
          write_interlaced_video_cuda(t, f, true);
        };
        return {init_interlaced, convert_func};
      }
      case AV_PIX_FMT_GBRP:
      case AV_PIX_FMT_GBRP16LE:
      case AV_PIX_FMT_YUV444P:
      case AV_PIX_FMT_YUV444P16LE: {
        auto num_planes = av_pix_fmt_count_planes(sw_pix_fmt);
        ConvertFunc convert_func = [=](const torch::Tensor& t, AVFrame* f) {
          write_planar_video_cuda(t, f, num_planes);
        };
        return {init_planar, convert_func};
      }
      default:
        TORCH_CHECK(
            false,
            "Unexpected pixel format for CUDA: ",
            av_get_pix_fmt_name(sw_pix_fmt));
    }
  }
  switch (pix_fmt) {
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24: {
      return {init_interlaced, write_interlaced_video};
    }
    case AV_PIX_FMT_YUV444P: {
      auto num_planes = av_pix_fmt_count_planes(pix_fmt);
      ConvertFunc convert_func = [=](const torch::Tensor& t, AVFrame* f) {
        write_planar_video(t, f, num_planes);
      };
      return {init_planar, convert_func};
    }
    default:
      TORCH_CHECK(
          false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt));
  }
}
AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  AVFramePtr frame{};
  if (codec_ctx->pix_fmt == AV_PIX_FMT_CUDA) {
    int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
    TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
  } else {
    frame->format = src_fmt;
    frame->width = codec_ctx->width;
    frame->height = codec_ctx->height;
    int ret = av_frame_get_buffer(frame, 0);
    TORCH_CHECK(
        ret >= 0,
        "Error allocating a video buffer (",
        av_err2string(ret),
        ").");
  }
  return frame;
}
void validate_video_input(
    enum AVPixelFormat fmt,
    AVCodecContext* ctx,
    const torch::Tensor& t) {
  if (fmt == AV_PIX_FMT_CUDA) {
    TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA.");
    fmt = ctx->sw_pix_fmt;
  } else {
    TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU.");
  }
  auto dtype = t.dtype().toScalarType();
  TORCH_CHECK(dtype == c10::ScalarType::Byte, "Expected Tensor of uint8 type.");
  TORCH_CHECK(t.dim() == 4, "Input Tensor has to be 4D.");
  // Note: the number of color components is not the same as the number of
  // planes. For example, NV12 has only two planes; U and V are interleaved in
  // the second plane.
  int num_color_components = av_pix_fmt_desc_get(fmt)->nb_components;
  const auto channels = t.size(1);
  const auto height = t.size(2);
  const auto width = t.size(3);
  TORCH_CHECK(
      channels == num_color_components && height == ctx->height &&
          width == ctx->width,
      "Expected tensor with shape (N, ",
      num_color_components,
      ", ",
      ctx->height,
      ", ",
      ctx->width,
      ") (NCHW format). Found ",
      t.sizes());
}

} // namespace
VideoTensorConverter::VideoTensorConverter(
    enum AVPixelFormat src_fmt_,
    AVCodecContext* codec_ctx_)
    : src_fmt(src_fmt_),
      codec_ctx(codec_ctx_),
      buffer(get_video_frame(src_fmt_, codec_ctx_)) {
  std::tie(init_func, convert_func) = get_func(src_fmt, codec_ctx->sw_pix_fmt);
}

Generator VideoTensorConverter::convert(const torch::Tensor& frames) {
  validate_video_input(src_fmt, codec_ctx, frames);
  return Generator{init_func(frames), buffer, convert_func};
}

} // namespace torchaudio::io
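As a side note, the effect of `init_interlaced` can be checked in isolation. A small libtorch sketch (assumes libtorch is available; not part of the commit):

```
#include <torch/torch.h>
#include <iostream>

int main() {
  // 2 frames, 3 channels, 4x5 pixels, NCHW.
  auto nchw = torch::arange(2 * 3 * 4 * 5, torch::kUInt8).reshape({2, 3, 4, 5});
  // What init_interlaced does: NCHW -> NHWC, made contiguous so that each
  // frame slice is a flat H*W*C byte block that write_interlaced_video can
  // copy row by row.
  auto nhwc = nchw.permute({0, 2, 3, 1}).contiguous();
  std::cout << nhwc.sizes() << '\n';         // [2, 4, 5, 3]
  std::cout << nhwc.is_contiguous() << '\n'; // 1
}
```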
#pragma once

#include <torchaudio/csrc/ffmpeg/stream_writer/converter.h>

namespace torchaudio::io {

////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter
////////////////////////////////////////////////////////////////////////////////
// VideoTensorConverter is responsible for picking the right pair of conversion
// routines (InitFunc and ConvertFunc) based on the input pixel format, and for
// owning them.
class VideoTensorConverter {
 public:
  // Initialization is a one-time process applied to the input frames before
  // the iteration starts, e.g. converting to NHWC and ensuring contiguity.
  using InitFunc = std::function<torch::Tensor(const torch::Tensor&)>;

 private:
  enum AVPixelFormat src_fmt;
  AVCodecContext* codec_ctx;
  AVFramePtr buffer;
  InitFunc init_func{};
  Generator::ConvertFunc convert_func{};

 public:
  VideoTensorConverter(enum AVPixelFormat src_fmt, AVCodecContext* codec_ctx);
  Generator convert(const torch::Tensor& frames);
};

} // namespace torchaudio::io
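A hedged usage sketch of the public entry point (it mirrors how VideoOutputStream uses the converter in this diff; `encode_frames` is a hypothetical caller and the encoder hook is left as a comment):

```
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

void encode_frames(
    const torch::Tensor& frames, // NCHW, uint8; validated by convert()
    AVPixelFormat src_fmt,
    AVCodecContext* codec_ctx) {
  torchaudio::io::VideoTensorConverter converter{src_fmt, codec_ctx};
  int64_t pts = 0;
  for (const auto& frame : converter.convert(frames)) {
    // The converter-owned AVFrame buffer is overwritten on each iteration,
    // so the frame must be consumed before advancing.
    frame->pts = pts++;
    // ... hand `frame` to the encoder here ...
  }
}
```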
@@ -33,26 +33,6 @@ FilterGraph get_video_filter(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  return p;
}

AVFramePtr get_video_frame(AVPixelFormat src_fmt, AVCodecContext* codec_ctx) {
  AVFramePtr frame{};
  if (codec_ctx->pix_fmt == AV_PIX_FMT_CUDA) {
    int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
    TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
  } else {
    frame->format = src_fmt;
    frame->width = codec_ctx->width;
    frame->height = codec_ctx->height;
    int ret = av_frame_get_buffer(frame, 0);
    TORCH_CHECK(
        ret >= 0,
        "Error allocating a video buffer (",
        av_err2string(ret),
        ").");
  }
  return frame;
}

} // namespace
VideoOutputStream::VideoOutputStream(
@@ -65,245 +45,17 @@ VideoOutputStream::VideoOutputStream(
          format_ctx,
          codec_ctx_,
          get_video_filter(src_fmt, codec_ctx_)),
      src_frame(get_video_frame(src_fmt, codec_ctx_)),
      converter(src_fmt, codec_ctx_),
      hw_device_ctx(std::move(hw_device_ctx_)),
      hw_frame_ctx(std::move(hw_frame_ctx_)),
      codec_ctx(std::move(codec_ctx_)) {}
namespace {

void validate_video_input(
    enum AVPixelFormat fmt,
    AVCodecContext* ctx,
    const torch::Tensor& t) {
  if (fmt == AV_PIX_FMT_CUDA) {
    TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA.");
    fmt = ctx->sw_pix_fmt;
  } else {
    TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU.");
  }
  auto dtype = t.dtype().toScalarType();
  TORCH_CHECK(dtype == c10::ScalarType::Byte, "Expected Tensor of uint8 type.");
  TORCH_CHECK(t.dim() == 4, "Input Tensor has to be 4D.");
  // Note: the number of color components is not the same as the number of
  // planes. For example, NV12 has only two planes; U and V are interleaved in
  // the second plane.
  int num_color_components = av_pix_fmt_desc_get(fmt)->nb_components;
  const auto channels = t.size(1);
  const auto height = t.size(2);
  const auto width = t.size(3);
  TORCH_CHECK(
      channels == num_color_components && height == ctx->height &&
          width == ctx->width,
      "Expected tensor with shape (N, ",
      num_color_components,
      ", ",
      ctx->height,
      ", ",
      ctx->width,
      ") (NCHW format). Found ",
      t.sizes());
}
void write_interlaced_video_cuda(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    bool pad_extra) {
#ifdef USE_CUDA
  const auto height = chunk.size(0);
  const auto width = chunk.size(1);
  const auto num_channels = chunk.size(2) + (pad_extra ? 1 : 0);
  size_t spitch = width * num_channels;
  if (cudaSuccess !=
      cudaMemcpy2D(
          (void*)(buffer->data[0]),
          buffer->linesize[0],
          (const void*)(chunk.data_ptr<uint8_t>()),
          spitch,
          spitch,
          height,
          cudaMemcpyDeviceToDevice)) {
    TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
  }
#else
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}

void write_planar_video_cuda(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    int num_planes) {
#ifdef USE_CUDA
  const auto height = chunk.size(1);
  const auto width = chunk.size(2);
  for (int j = 0; j < num_planes; ++j) {
    if (cudaSuccess !=
        cudaMemcpy2D(
            (void*)(buffer->data[j]),
            buffer->linesize[j],
            (const void*)(chunk.index({j}).data_ptr<uint8_t>()),
            width,
            width,
            height,
            cudaMemcpyDeviceToDevice)) {
      TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
    }
  }
#else
  TORCH_CHECK(
      false,
      "torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}
// Interlaced video
// Each frame is composed of one plane, and the color components of each pixel
// are collocated.
// The memory layout is 1D linear, interpreted as follows.
//
//     |<------ linesize[0] ------>|
//      0   1 ...  W
// 0:  RGB RGB ... RGB PAD ... PAD
// 1:  RGB RGB ... RGB PAD ... PAD
// ...
// H:  RGB RGB ... RGB PAD ... PAD
void write_interlaced_video(const torch::Tensor& chunk, AVFrame* buffer) {
  const auto height = chunk.size(0);
  const auto width = chunk.size(1);
  const auto num_channels = chunk.size(2);
  size_t stride = width * num_channels;
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  uint8_t* src = chunk.data_ptr<uint8_t>();
  uint8_t* dst = buffer->data[0];
  for (int h = 0; h < height; ++h) {
    std::memcpy(dst, src, stride);
    src += width * num_channels;
    dst += buffer->linesize[0];
  }
}
// Planar video
// Each frame is composed of multiple planes.
// One plane can contain one or more color components.
// (At the moment, only formats without subsampled color components are
// accepted.)
//
// The memory layout is interpreted as follows.
//
//      |<------ linesize[0] ------>|
//       0  1 ...  W1
// 0:    Y  Y ...  Y  PAD ... PAD
// 1:    Y  Y ...  Y  PAD ... PAD
// ...
// H1:   Y  Y ...  Y  PAD ... PAD
//
//      |<--- linesize[1] --->|
//       0  ...  W2
// 0:   UV ... UV PAD ... PAD
// 1:   UV ... UV PAD ... PAD
// ...
// H2:  UV ... UV PAD ... PAD
//
void write_planar_video(
    const torch::Tensor& chunk,
    AVFrame* buffer,
    int num_planes) {
  const auto height = chunk.size(1);
  const auto width = chunk.size(2);
  // TODO: writable
  // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
  TORCH_INTERNAL_ASSERT(av_frame_is_writable(buffer), "frame is not writable.");
  for (int j = 0; j < num_planes; ++j) {
    uint8_t* src = chunk.index({j}).data_ptr<uint8_t>();
    uint8_t* dst = buffer->data[j];
    for (int h = 0; h < height; ++h) {
      std::memcpy(dst, src, width);
      src += width;
      dst += buffer->linesize[j];
    }
  }
}

} // namespace
void VideoOutputStream::write_chunk(const torch::Tensor& frames) {
  enum AVPixelFormat fmt = static_cast<AVPixelFormat>(src_frame->format);
  validate_video_input(fmt, codec_ctx, frames);
  const auto num_frames = frames.size(0);
#ifdef USE_CUDA
  if (fmt == AV_PIX_FMT_CUDA) {
    fmt = codec_ctx->sw_pix_fmt;
    switch (fmt) {
      case AV_PIX_FMT_RGB0:
      case AV_PIX_FMT_BGR0: {
        auto chunks = frames.permute({0, 2, 3, 1}).contiguous(); // to NHWC
        for (int i = 0; i < num_frames; ++i) {
          write_interlaced_video_cuda(chunks.index({i}), src_frame, true);
          process_frame();
        }
        return;
      }
      case AV_PIX_FMT_GBRP:
      case AV_PIX_FMT_GBRP16LE:
      case AV_PIX_FMT_YUV444P:
      case AV_PIX_FMT_YUV444P16LE: {
        auto chunks = frames.contiguous();
        for (int i = 0; i < num_frames; ++i) {
          write_planar_video_cuda(
              chunks.index({i}), src_frame, av_pix_fmt_count_planes(fmt));
          process_frame();
        }
        return;
      }
      default:
        TORCH_CHECK(
            false,
            "Unexpected pixel format for CUDA: ",
            av_get_pix_fmt_name(fmt));
    }
  for (const auto& frame : converter.convert(frames)) {
    frame->pts = num_frames;
    num_frames += 1;
    process_frame(frame);
  }
#endif
  switch (fmt) {
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24: {
      auto chunks = frames.permute({0, 2, 3, 1}).contiguous();
      for (int i = 0; i < num_frames; ++i) {
        write_interlaced_video(chunks.index({i}), src_frame);
        process_frame();
      }
      return;
    }
    case AV_PIX_FMT_YUV444P: {
      auto chunks = frames.contiguous();
      for (int i = 0; i < num_frames; ++i) {
        write_planar_video(
            chunks.index({i}), src_frame, av_pix_fmt_count_planes(fmt));
        process_frame();
      }
      return;
    }
    default:
      TORCH_CHECK(false, "Unexpected pixel format: ", av_get_pix_fmt_name(fmt));
  }
}
void VideoOutputStream::process_frame() {
  src_frame->pts = num_frames;
  num_frames += 1;
  OutputStream::process_frame(src_frame);
}

} // namespace torchaudio::io
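Since `write_chunk` stamps `pts` with a running frame index, the presentation time of each frame is `pts * time_base`; the stream is therefore assumed to be configured with `time_base = 1 / frame_rate`. A small arithmetic sketch of that relationship (illustrative assumption, not part of the commit):

```
#include <cstdint>
#include <iostream>

int main() {
  // Assumed configuration: 30 fps, i.e. time_base = {1, 30}.
  const int num = 1, den = 30;
  for (int64_t pts : {0, 30, 90}) {
    // e.g. the frame with pts 90 is presented at 90 * 1/30 = 3.0 seconds.
    std::cout << "pts " << pts << " -> "
              << pts * num / static_cast<double>(den) << " s\n";
  }
}
```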
#pragma once

#include <torchaudio/csrc/ffmpeg/stream_writer/output_stream.h>
#include <torchaudio/csrc/ffmpeg/stream_writer/video_converter.h>

namespace torchaudio::io {

struct VideoOutputStream : OutputStream {
  AVFramePtr src_frame;
  VideoTensorConverter converter;
  AVBufferRefPtr hw_device_ctx;
  AVBufferRefPtr hw_frame_ctx;
  AVCodecContextPtr codec_ctx;
@@ -18,7 +18,6 @@ struct VideoOutputStream : OutputStream {
      AVBufferRefPtr&& hw_frame_ctx);

  void write_chunk(const torch::Tensor& frames) override;
  void process_frame();
  ~VideoOutputStream() override = default;
};