"docs/source/vscode:/vscode.git/clone" did not exist on "b8ce0f41a3d440aa367e076e2c1556e26bce383d"
Commit 95eada24 authored by moto, committed by Facebook GitHub Bot

Add CUDA HW encoding support to StreamWriter (#2505)

Summary:
This commit adds CUDA hardware encoding to StreamWriter.
For certain video formats, it can encode video directly from a
CUDA Tensor, without needing to move the data to the host CPU.

Pull Request resolved: https://github.com/pytorch/audio/pull/2505

Reviewed By: hwangjeff

Differential Revision: D37446830

Pulled By: mthrok

fbshipit-source-id: eee6424f01a99a3b611dcad45ed58f86cba4672a
parent 28da8b84
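
For illustration, a minimal usage sketch of the new option. It assumes the Python API shown at the end of this diff, an FFmpeg build with NVENC support, and the torchaudio.io import path (the class lived under a prototype namespace around this commit); the pixel-format combination is an assumption, not taken from this diff.

import torch
from torchaudio.io import StreamWriter  # import path assumed; may be a prototype module at this commit

# Configure one H.264 NVENC stream that consumes CUDA tensors directly.
writer = StreamWriter("output.mp4")
writer.add_video_stream(
    frame_rate=30,
    width=320,
    height=240,
    format="yuv444p",          # source pixel format (assumed)
    encoder="h264_nvenc",      # CUDA hardware encoder
    encoder_format="yuv444p",  # planar format handled by the CUDA write path
    hw_accel="cuda:0",         # this stream now expects CUDA tensors
)

with writer.open():
    # (num_frames, channels, height, width), uint8, already resident on the GPU
    chunk = torch.randint(0, 256, (30, 3, 240, 320), dtype=torch.uint8, device="cuda:0")
    writer.write_video_chunk(0, chunk)
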
@@ -41,7 +41,8 @@ void StreamWriterFileObj::add_video_stream(
std::string format,
const c10::optional<std::string>& encoder,
const c10::optional<std::map<std::string, std::string>>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
StreamWriter::add_video_stream(
frame_rate,
width,
@@ -49,7 +50,8 @@ void StreamWriterFileObj::add_video_stream(
format,
encoder,
map2dict(encoder_option),
encoder_format);
encoder_format,
hw_accel);
}
} // namespace ffmpeg
......
@@ -27,7 +27,8 @@ class StreamWriterFileObj : protected FileObj, public StreamWriterBinding {
std::string format,
const c10::optional<std::string>& encoder,
const c10::optional<std::map<std::string, std::string>>& encoder_option,
const c10::optional<std::string>& encoder_format);
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel);
};
} // namespace ffmpeg
......
#include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
#ifdef USE_CUDA
#include <c10/cuda/CUDAStream.h>
#endif
namespace torchaudio {
namespace ffmpeg {
namespace {
@@ -258,6 +262,13 @@ AVFramePtr get_audio_frame(
return frame;
}
AVFramePtr get_hw_video_frame(AVCodecContextPtr& codec_ctx) {
AVFramePtr frame{};
int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
return frame;
}
AVFramePtr get_video_frame(
enum AVPixelFormat fmt,
AVCodecContextPtr& codec_ctx) {
@@ -417,7 +428,9 @@ void StreamWriter::add_audio_stream(
std::move(src_frame),
std::move(dst_frame),
0,
frame_capacity});
frame_capacity,
AVBufferRefPtr{},
AVBufferRefPtr{}});
}
void StreamWriter::add_video_stream(
@@ -427,21 +440,93 @@ void StreamWriter::add_video_stream(
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
const torch::Device device = [&]() {
if (!hw_accel) {
return torch::Device{c10::DeviceType::CPU};
}
#ifdef USE_CUDA
torch::Device d{hw_accel.value()};
TORCH_CHECK(
d.type() == c10::DeviceType::CUDA,
"Only CUDA is supported for hardware acceleration. Found:",
device.str());
return d;
#else
TORCH_CHECK(
false,
"torchaudio is not compiled with CUDA support. Hardware acceleration is not available.");
#endif
}();
enum AVPixelFormat src_fmt = _get_src_pixel_fmt(format);
AVCodecContextPtr ctx =
get_codec_ctx(AVMEDIA_TYPE_VIDEO, pFormatContext->oformat, encoder);
configure_video_codec(ctx, frame_rate, width, height, encoder_format);
AVBufferRefPtr hw_device_ctx{};
AVBufferRefPtr hw_frame_ctx{};
#ifdef USE_CUDA
if (device.type() == c10::DeviceType::CUDA) {
AVBufferRef* device_ctx = nullptr;
int ret = av_hwdevice_ctx_create(
&device_ctx,
AV_HWDEVICE_TYPE_CUDA,
std::to_string(device.index()).c_str(),
nullptr,
0);
TORCH_CHECK(
ret >= 0, "Failed to create CUDA device context: ", av_err2string(ret));
hw_device_ctx.reset(device_ctx);
AVBufferRef* frames_ref = av_hwframe_ctx_alloc(device_ctx);
TORCH_CHECK(frames_ref, "Failed to create CUDA frame context.");
hw_frame_ctx.reset(frames_ref);
AVHWFramesContext* frames_ctx = (AVHWFramesContext*)(frames_ref->data);
frames_ctx->format = AV_PIX_FMT_CUDA;
frames_ctx->sw_format = ctx->pix_fmt;
frames_ctx->width = ctx->width;
frames_ctx->height = ctx->height;
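// Pre-allocate a pool of 20 GPU frames for the encoder to reuse.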
frames_ctx->initial_pool_size = 20;
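// The encoder works on AV_PIX_FMT_CUDA frames; the actual pixel layout is recorded in sw_pix_fmt.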
ctx->sw_pix_fmt = ctx->pix_fmt;
ctx->pix_fmt = AV_PIX_FMT_CUDA;
ret = av_hwframe_ctx_init(frames_ref);
TORCH_CHECK(
ret >= 0,
"Failed to initialize CUDA frame context: ",
av_err2string(ret));
ctx->hw_frames_ctx = av_buffer_ref(frames_ref);
TORCH_CHECK(
ctx->hw_frames_ctx,
"Failed to attach CUDA frames to encoding context: ",
av_err2string(ret));
}
#endif
open_codec(ctx, encoder_option);
AVStream* stream = add_stream(ctx);
std::unique_ptr<FilterGraph> filter = src_fmt == ctx->pix_fmt
? std::unique_ptr<FilterGraph>(nullptr)
: _get_video_filter(src_fmt, ctx);
AVFramePtr src_frame = get_video_frame(src_fmt, ctx);
std::unique_ptr<FilterGraph> filter = [&]() {
if (src_fmt != ctx->pix_fmt && device.type() == c10::DeviceType::CPU) {
return _get_video_filter(src_fmt, ctx);
}
return std::unique_ptr<FilterGraph>(nullptr);
}();
// CUDA: require src_frame
// CPU: require dst_frame when filter is enabled
AVFramePtr src_frame = [&]() {
if (device.type() == c10::DeviceType::CUDA) {
return get_hw_video_frame(ctx);
}
return get_video_frame(src_fmt, ctx);
}();
AVFramePtr dst_frame =
filter ? AVFramePtr{} : get_video_frame(ctx->pix_fmt, ctx);
filter ? get_video_frame(ctx->pix_fmt, ctx) : AVFramePtr{};
streams.emplace_back(OutputStream{
stream,
std::move(ctx),
@@ -449,7 +534,9 @@ void StreamWriter::add_video_stream(
std::move(src_frame),
std::move(dst_frame),
0,
-1});
-1,
std::move(hw_device_ctx),
std::move(hw_frame_ctx)});
}
AVStream* StreamWriter::add_stream(AVCodecContextPtr& codec_ctx) {
@@ -720,6 +807,31 @@ void StreamWriter::write_video_chunk(int i, const torch::Tensor& frames) {
OutputStream& os = streams[i];
enum AVPixelFormat fmt = static_cast<AVPixelFormat>(os.src_frame->format);
#ifdef USE_CUDA
if (fmt == AV_PIX_FMT_CUDA) {
TORCH_CHECK(frames.device().is_cuda(), "Input tensor has to be on CUDA.");
enum AVPixelFormat sw_fmt = os.codec_ctx->sw_pix_fmt;
validate_video_input(sw_fmt, os.codec_ctx, frames);
switch (sw_fmt) {
case AV_PIX_FMT_RGB0:
case AV_PIX_FMT_BGR0:
write_interlaced_video_cuda(os, frames, true);
return;
case AV_PIX_FMT_GBRP:
case AV_PIX_FMT_GBRP16LE:
case AV_PIX_FMT_YUV444P:
case AV_PIX_FMT_YUV444P16LE:
write_planar_video_cuda(os, frames, av_pix_fmt_count_planes(sw_fmt));
return;
default:
TORCH_CHECK(
false,
"Unexpected pixel format for CUDA: ",
av_get_pix_fmt_name(sw_fmt));
}
}
#endif
TORCH_CHECK(frames.device().is_cpu(), "Input tensor has to be on CPU.");
validate_video_input(fmt, os.codec_ctx, frames);
switch (fmt) {
@@ -736,6 +848,75 @@ void StreamWriter::write_video_chunk(int i, const torch::Tensor& frames) {
}
}
#ifdef USE_CUDA
void StreamWriter::write_interlaced_video_cuda(
OutputStream& os,
const torch::Tensor& frames,
bool pad_extra) {
const auto num_frames = frames.size(0);
const auto num_channels = frames.size(1);
const auto height = frames.size(2);
const auto width = frames.size(3);
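// RGB0/BGR0 use four bytes per pixel with the last byte unused, so the
// staging buffer carries one extra channel when pad_extra is set.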
const auto num_channels_buffer = num_channels + (pad_extra ? 1 : 0);
using namespace torch::indexing;
torch::Tensor buffer =
torch::empty({height, width, num_channels_buffer}, frames.options());
size_t spitch = width * num_channels_buffer;
for (int i = 0; i < num_frames; ++i) {
// Slice frame as HWC
auto chunk = frames.index({i}).permute({1, 2, 0});
buffer.index_put_({"...", Slice(0, num_channels)}, chunk);
if (cudaSuccess !=
cudaMemcpy2D(
(void*)(os.src_frame->data[0]),
os.src_frame->linesize[0],
(const void*)(buffer.data_ptr<uint8_t>()),
spitch,
spitch,
height,
cudaMemcpyDeviceToDevice)) {
TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
}
os.src_frame->pts = os.num_frames;
os.num_frames += 1;
encode_frame(os.src_frame, os.codec_ctx, os.stream);
}
}
void StreamWriter::write_planar_video_cuda(
OutputStream& os,
const torch::Tensor& frames,
int num_planes) {
const auto num_frames = frames.size(0);
const auto height = frames.size(2);
const auto width = frames.size(3);
using namespace torch::indexing;
torch::Tensor buffer = torch::empty({height, width}, frames.options());
for (int i = 0; i < num_frames; ++i) {
for (int j = 0; j < num_planes; ++j) {
buffer.index_put_({"..."}, frames.index({i, j}));
if (cudaSuccess !=
cudaMemcpy2D(
(void*)(os.src_frame->data[j]),
os.src_frame->linesize[j],
(const void*)(buffer.data_ptr<uint8_t>()),
width,
width,
height,
cudaMemcpyDeviceToDevice)) {
TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor.");
}
}
os.src_frame->pts = os.num_frames;
os.num_frames += 1;
encode_frame(os.src_frame, os.codec_ctx, os.stream);
}
}
#endif
// Interlaced video
// Each frame is composed of one plane, and color components for each pixel are
// collocated.
......
@@ -17,10 +17,14 @@ struct OutputStream {
int64_t num_frames;
// Audio-only: The maximum frames that frame can hold
int64_t frame_capacity;
// Video-only: HW acceleration
AVBufferRefPtr hw_device_ctx;
AVBufferRefPtr hw_frame_ctx;
};
class StreamWriter {
AVFormatOutputContextPtr pFormatContext;
AVBufferRefPtr pHWBufferRef;
std::vector<OutputStream> streams;
AVPacketPtr pkt;
@@ -55,7 +59,8 @@ class StreamWriter {
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format);
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel);
void set_metadata(const OptionDict& metadata);
private:
@@ -79,6 +84,16 @@ class StreamWriter {
const torch::Tensor& chunk,
int num_planes);
void write_interlaced_video(OutputStream& os, const torch::Tensor& chunk);
#ifdef USE_CUDA
void write_planar_video_cuda(
OutputStream& os,
const torch::Tensor& chunk,
int num_planes);
void write_interlaced_video_cuda(
OutputStream& os,
const torch::Tensor& chunk,
bool pad_extra = true);
#endif
void process_frame(
AVFrame* src_frame,
std::unique_ptr<FilterGraph>& filter,
......
@@ -43,7 +43,8 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
const std::string& format,
const c10::optional<std::string>& encoder,
const c10::optional<OptionDict>& encoder_option,
const c10::optional<std::string>& encoder_format) {
const c10::optional<std::string>& encoder_format,
const c10::optional<std::string>& hw_accel) {
s->add_video_stream(
frame_rate,
width,
@@ -51,7 +52,8 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
format,
encoder,
encoder_option,
encoder_format);
encoder_format,
hw_accel);
})
.def(
"set_metadata",
......
@@ -159,6 +159,7 @@ class StreamWriter:
encoder: Optional[str] = None,
encoder_option: Optional[Dict[str, str]] = None,
encoder_format: Optional[str] = None,
hw_accel: Optional[str] = None,
):
"""Add an output video stream.
@@ -189,8 +190,18 @@ class StreamWriter:
encoder_option (dict or None, optional): {encoder_option}
encoder_format (str or None, optional): {encoder_format}
hw_accel (str or None, optional): Enable hardware acceleration.
When the video is encoded on CUDA hardware, for example with
`encoder="h264_nvenc"`, passing a CUDA device indicator to `hw_accel`
(e.g. `hw_accel="cuda:0"`) makes StreamWriter expect the video
chunks to be CUDA Tensors. Passing a CPU Tensor results in an error.
If `None`, the video chunk Tensor has to be a CPU Tensor.
Default: ``None``.
"""
self._s.add_video_stream(frame_rate, width, height, format, encoder, encoder_option, encoder_format)
self._s.add_video_stream(frame_rate, width, height, format, encoder, encoder_option, encoder_format, hw_accel)
def set_metadata(self, metadata: Dict[str, str]):
"""Set file-level metadata
......
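
To make the device contract described in the hw_accel docstring concrete, a hedged sketch contrasting the two modes (same API assumptions as the sketch near the top of this page; the stream parameters are illustrative):

import torch
from torchaudio.io import StreamWriter  # import path assumed, as above

writer = StreamWriter("output.mp4")
# Stream 0: default CPU path (hw_accel=None) -> write_video_chunk expects CPU tensors.
writer.add_video_stream(30, 320, 240, format="yuv444p")
# Stream 1: CUDA path -> write_video_chunk expects CUDA tensors; a CPU tensor raises an error.
writer.add_video_stream(
    30, 320, 240, format="yuv444p",
    encoder="h264_nvenc", encoder_format="yuv444p", hw_accel="cuda:0",
)

with writer.open():
    writer.write_video_chunk(0, torch.zeros((10, 3, 240, 320), dtype=torch.uint8))
    writer.write_video_chunk(1, torch.zeros((10, 3, 240, 320), dtype=torch.uint8, device="cuda:0"))
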