Commit c5b96558 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Support CUDA frame in FilterGraph (#3183)

Summary:
This commit adds CUDA frame support to FilterGraph

It initializes and attaches CUDA frames context to FilterGraph,
so that CUDA frames can be processed in FilterGraph.

As a result, it enables
1. CUDA filter support such as `scale_cuda`
2. Properly retrieve the pixel format coming out of FilterGraph when
   CUDA HW acceleration is enabled. (currently it is reported as "cuda")

Resolves https://github.com/pytorch/audio/issues/3159

Pull Request resolved: https://github.com/pytorch/audio/pull/3183

Reviewed By: hwangjeff

Differential Revision: D44183722

Pulled By: mthrok

fbshipit-source-id: 522d21039c361ddfaa87fa89cf49c19d210ac62f
parent 0c8c138c
......@@ -654,6 +654,7 @@ jobs:
command: .circleci/unittest/linux/scripts/run_test.sh
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
......@@ -689,11 +690,12 @@ jobs:
- run:
name: Run tests
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
command: |
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
- store_test_results:
path: test-results
- store_artifacts:
......@@ -726,6 +728,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
......@@ -814,6 +817,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true
......
......@@ -654,6 +654,7 @@ jobs:
command: .circleci/unittest/linux/scripts/run_test.sh
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
......@@ -689,11 +690,12 @@ jobs:
- run:
name: Run tests
environment:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true
command: |
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
docker run -t --gpus all -v $PWD:$PWD -w $PWD -e "CI=${CI}" -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310 -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS -e TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL "${image_name}" .circleci/unittest/linux/scripts/run_test.sh
- store_test_results:
path: test-results
- store_artifacts:
......@@ -726,6 +728,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
......@@ -814,6 +817,7 @@ jobs:
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true
TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true
......
......@@ -72,6 +72,7 @@ jobs:
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL=true
export TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310=true
declare -a args=(
......
......@@ -10,6 +10,7 @@ from .case_utils import (
skipIfNoCuda,
skipIfNoExec,
skipIfNoFFmpeg,
skipIfNoHWAccel,
skipIfNoKaldi,
skipIfNoMacOS,
skipIfNoModule,
......@@ -55,6 +56,7 @@ __all__ = [
"skipIfRocm",
"skipIfNoQengine",
"skipIfNoFFmpeg",
"skipIfNoHWAccel",
"skipIfPy310",
"get_wav_data",
"normalize_wav",
......
......@@ -12,6 +12,7 @@ import torch
import torchaudio
from torch.testing._internal.common_utils import TestCase as PytorchTestCase
from torchaudio._internal.module_utils import is_module_available
from torchaudio.utils.ffmpeg_utils import get_video_decoders, get_video_encoders
from .backend_utils import set_audio_backend
......@@ -270,6 +271,19 @@ skipIfNoMacOS = _skipIf(
)
def skipIfNoHWAccel(name):
    """Decorator factory: skip a test unless FFmpeg HW-acceleration prerequisites are met.

    The test is skipped (under the ``NO_HW_ACCEL`` skip key) when any of the
    following is missing: the FFmpeg integration, a CUDA device, a
    CUDA-enabled torchaudio build, or the named video decoder/encoder.

    Args:
        name (str): Name of the FFmpeg video decoder or encoder the test requires.
    """
    key = "NO_HW_ACCEL"
    if not is_ffmpeg_available():
        return _skipIf(True, reason="ffmpeg features are not available.", key=key)
    if not torch.cuda.is_available():
        return _skipIf(True, reason="CUDA is not available.", key=key)
    if torchaudio._extension._check_cuda_version() is None:
        return _skipIf(True, reason="Torchaudio is not compiled with CUDA.", key=key)
    # Only query the decoder/encoder lists after the FFmpeg check above passed.
    available = set(get_video_decoders()) | set(get_video_encoders())
    if name not in available:
        return _skipIf(True, reason=f"{name} is not in the list of available decoders or encoders", key=key)
    return _pass
def zip_equal(*iterables):
"""With the regular Python `zip` function, if one iterable is longer than the other,
the remainder portions are ignored.This is resolved in Python 3.10 where we can use
......
......@@ -14,10 +14,12 @@ from torchaudio_unittest.common_utils import (
save_image,
save_wav,
skipIfNoFFmpeg,
skipIfNoHWAccel,
TempDirMixin,
TorchaudioTestCase,
)
if is_ffmpeg_available():
from torchaudio.io import StreamReader, StreamWriter
from torchaudio.io._stream_reader import (
......@@ -1048,3 +1050,105 @@ class StreamReaderImageTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase)
self.assertEqual(chunks[8], rgba, atol=0, rtol=0)
self.assertEqual(chunks[9], abgr, atol=0, rtol=0)
self.assertEqual(chunks[10], bgra, atol=0, rtol=0)
@skipIfNoHWAccel("h264_cuvid")
class CuvidHWAccelInterfaceTest(TorchaudioTestCase):
    """Interface checks for mixing CUVID decoding with and without HW acceleration."""

    # Fixed typo in the method name: "acel" -> "accel".
    def test_dup_hw_accel(self):
        """Specifying the same source stream with and without HW accel should fail (instead of segfault later)"""
        src = get_asset_path("nasa_13013.mp4")

        # Plain decoder first, then HW accel on the same stream: must be rejected up front.
        r = StreamReader(src)
        r.add_video_stream(-1, decoder="h264_cuvid")
        with self.assertRaises(RuntimeError):
            r.add_video_stream(-1, decoder="h264_cuvid", hw_accel="cuda")

        # HW accel first, then plain decoder on the same stream: must also be rejected.
        r = StreamReader(src)
        r.add_video_stream(-1, decoder="h264_cuvid", hw_accel="cuda")
        with self.assertRaises(RuntimeError):
            r.add_video_stream(-1, decoder="h264_cuvid")
@_media_source
class CudaDecoderTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase):
    """Decoding via NVIDIA cuvid decoders, with and without CUDA frame output."""

    @skipIfNoHWAccel("h264_cuvid")
    def test_h264_cuvid(self):
        """GPU decoder works for H264"""
        source = self.get_src(get_asset_path("nasa_13013.mp4"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="h264_cuvid")
        total = 0
        for (frames,) in reader.stream():
            # Without hw_accel, frames are downloaded to CPU memory.
            self.assertEqual(frames.device, torch.device("cpu"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 270, 480]))
            total += frames.size(0)
        assert total == 390

    @skipIfNoHWAccel("h264_cuvid")
    def test_h264_cuvid_hw_accel(self):
        """GPU decoder works for H264 with HW acceleration, and put the frames on CUDA tensor"""
        source = self.get_src(get_asset_path("nasa_13013.mp4"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda")
        total = 0
        for (frames,) in reader.stream():
            # With hw_accel="cuda", frames stay on the GPU.
            self.assertEqual(frames.device, torch.device("cuda:0"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 270, 480]))
            total += frames.size(0)
        assert total == 390

    @skipIfNoHWAccel("hevc_cuvid")
    def test_hevc_cuvid(self):
        """GPU decoder works for H265/HEVC"""
        source = self.get_src(get_asset_path("testsrc.hevc"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="hevc_cuvid")
        total = 0
        for (frames,) in reader.stream():
            self.assertEqual(frames.device, torch.device("cpu"))
            self.assertEqual(frames.dtype, torch.uint8)
            self.assertEqual(frames.shape, torch.Size([10, 3, 144, 256]))
            total += frames.size(0)
        assert total == 300

    @skipIfNoHWAccel("hevc_cuvid")
    def test_hevc_cuvid_hw_accel(self):
        """GPU decoder works for H265/HEVC with HW acceleration, and put the frames on CUDA tensor"""
        source = self.get_src(get_asset_path("testsrc.hevc"))
        reader = StreamReader(source)
        reader.add_video_stream(10, decoder="hevc_cuvid", hw_accel="cuda")
        total = 0
        for (frames,) in reader.stream():
            self.assertEqual(frames.device, torch.device("cuda:0"))
            # NOTE(review): dtype differs from the CPU path (int16 vs uint8) —
            # presumably the HW surface format is a 16-bit one; confirm.
            self.assertEqual(frames.dtype, torch.int16)
            self.assertEqual(frames.shape, torch.Size([10, 3, 144, 256]))
            total += frames.size(0)
        assert total == 300
@skipIfNoHWAccel("h264_cuvid")
class FilterGraphWithCudaAccel(TorchaudioTestCase):
    """FilterGraph behavior when CUDA HW acceleration is enabled."""

    # Fixed typo in the method name: "sclae" -> "scale".
    def test_scale_cuda_change_size(self):
        """scale_cuda filter can be used when HW accel is on"""
        src = get_asset_path("nasa_13013.mp4")
        r = StreamReader(src)
        r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda", filter_desc="scale_cuda=iw/2:ih/2")
        num_frames = 0
        for (chunk,) in r.stream():
            self.assertEqual(chunk.device, torch.device("cuda:0"))
            self.assertEqual(chunk.dtype, torch.uint8)
            # scale_cuda=iw/2:ih/2 halves both dimensions of the 480x270 source.
            self.assertEqual(chunk.shape, torch.Size([10, 3, 135, 240]))
            num_frames += chunk.size(0)
        assert num_frames == 390

    def test_scale_cuda_format(self):
        """yuv444p format conversion does not work (yet)"""
        src = get_asset_path("nasa_13013.mp4")
        r = StreamReader(src)
        with self.assertRaises(RuntimeError):
            r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda", filter_desc="scale_cuda=format=yuv444p")
......@@ -165,7 +165,8 @@ void FilterGraph::add_process(const std::string& filter_description) {
av_err2string(ret) + ".)");
}
void FilterGraph::create_filter() {
void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
int ret = avfilter_graph_config(pFilterGraph, nullptr);
TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
// char* desc = avfilter_graph_dump(pFilterGraph, NULL);
......@@ -196,6 +197,10 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
#endif
} else {
if (l->format == AV_PIX_FMT_CUDA && l->hw_frames_ctx) {
auto frames_ctx = (AVHWFramesContext*)(l->hw_frames_ctx->data);
ret.format = frames_ctx->sw_format;
}
ret.frame_rate = l->frame_rate;
ret.height = l->h;
ret.width = l->w;
......
......@@ -63,7 +63,7 @@ class FilterGraph {
void add_process(const std::string& filter_description);
void create_filter();
void create_filter(AVBufferRef* hw_frames_ctx = nullptr);
//////////////////////////////////////////////////////////////////////////////
// Query methods
......
#include <torchaudio/csrc/ffmpeg/hw_context.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/chunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h>
#include <torchaudio/csrc/ffmpeg/stream_reader/sink.h>
......@@ -47,14 +48,8 @@ std::unique_ptr<Buffer> get_buffer(
codec_ctx->channels);
}
} else {
// Note
// When using HW decoder, the pixel format is CUDA, and FilterGraph does
// not yet support CUDA frames, nor propagating the software pixel format,
// so here, we refer to AVCodecContext* to look at the pixel format.
AVPixelFormat fmt = (AVPixelFormat)(info.format);
if (fmt == AV_PIX_FMT_CUDA) {
fmt = codec_ctx->sw_pix_fmt;
}
TORCH_INTERNAL_ASSERT(fmt != AV_PIX_FMT_CUDA);
if (frames_per_chunk == -1) {
return detail::get_unchunked_buffer(fmt, info.height, info.width, device);
......@@ -77,7 +72,6 @@ FilterGraph get_filter_graph(
AVRational frame_rate,
const std::string& filter_description) {
auto p = FilterGraph{codec_ctx->codec_type};
switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO:
p.add_audio_src(
......@@ -100,7 +94,11 @@ FilterGraph get_filter_graph(
}
p.add_sink();
p.add_process(filter_description);
p.create_filter();
if (codec_ctx->hw_frames_ctx) {
p.create_filter(av_buffer_ref(codec_ctx->hw_frames_ctx));
} else {
p.create_filter(nullptr);
}
return p;
}
......
......@@ -81,6 +81,27 @@ enum AVPixelFormat get_hw_format(
return AV_PIX_FMT_NONE;
}
// Allocates and initializes an AVHWFramesContext bound to the codec's
// HW device context, so CUDA frames can later be attached to a FilterGraph.
// Returns a new AVBufferRef owned by the caller; throws (TORCH_CHECK) on
// allocation or initialization failure.
AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
  AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
  TORCH_CHECK(
      p,
      "Failed to allocate CUDA frame context from device context at ",
      codec_ctx->hw_device_ctx);
  // Mirror the codec's frame geometry and pixel formats into the frame pool.
  auto frames_ctx = (AVHWFramesContext*)(p->data);
  frames_ctx->format = codec_ctx->pix_fmt;       // HW pixel format
  frames_ctx->sw_format = codec_ctx->sw_pix_fmt; // underlying software format
  frames_ctx->width = codec_ctx->width;
  frames_ctx->height = codec_ctx->height;
  // NOTE(review): pool size of 5 looks arbitrary — confirm it is sufficient
  // for the decoder/filter pipeline's in-flight frame count.
  frames_ctx->initial_pool_size = 5;
  int ret = av_hwframe_ctx_init(p);
  if (ret >= 0) {
    return p;
  }
  // Initialization failed: release our reference before raising.
  av_buffer_unref(&p);
  TORCH_CHECK(
      false, "Failed to initialize CUDA frame context: ", av_err2string(ret));
}
void configure_codec_context(
AVCodecContext* codec_ctx,
const AVCodecParameters* params,
......@@ -135,6 +156,9 @@ AVCodecContextPtr get_codec_ctx(
alloc_codec_context(params->codec_id, decoder_name);
configure_codec_context(codec_ctx, params, device);
open_codec(codec_ctx, decoder_option);
if (codec_ctx->hw_device_ctx) {
codec_ctx->hw_frames_ctx = av_buffer_ref(get_hw_frames_ctx(codec_ctx));
}
return codec_ctx;
}
......@@ -160,6 +184,38 @@ KeyType StreamProcessor::add_stream(
AVRational frame_rate,
const c10::optional<std::string>& filter_description,
const torch::Device& device) {
// If device is provided, then check that codec_ctx has hw_device_ctx set.
// In case, defining an output stream with HW accel on an input stream that
// has decoder set without HW accel, it will cause seg fault.
// i.e.
// The following should be rejected here.
// reader = StreamReader(...)
// reader.add_video_stream(..., decoder="h264_cuvid")
// reader.add_video_stream(..., decoder="h264_cuvid", hw_accel="cuda")
// TODO:
// One idea to work around this is to always define HW device context, and
// if HW acceleration is not required, insert `hwdownload` filter.
// This way it will be possible to handle both cases at the same time.
switch (device.type()) {
case torch::kCPU:
TORCH_CHECK(
!codec_ctx->hw_device_ctx,
"Decoding without Hardware acceleration is requested, however, "
"the decoder has been already defined with a HW acceleration. "
"Decoding a stream with and without HW acceleration simultaneously "
"is not supported.");
break;
case torch::kCUDA:
TORCH_CHECK(
codec_ctx->hw_device_ctx,
"CUDA Hardware acceleration is requested, however, the decoder has "
"been already defined without a HW acceleration. "
"Decoding a stream with and without HW acceleration simultaneously "
"is not supported.");
break;
default:;
}
switch (codec_ctx->codec_type) {
case AVMEDIA_TYPE_AUDIO:
case AVMEDIA_TYPE_VIDEO:
......
......@@ -350,6 +350,15 @@ void StreamReader::add_stream(
processors[i] = std::make_unique<StreamProcessor>(
stream->time_base, stream->codecpar, decoder, decoder_option, device);
processors[i]->set_discard_timestamp(seek_timestamp);
} else {
if (decoder) {
// TODO: Validate that the decoder is consistent as the one used to define
// previous output streams.
// i.e. the following is not permitted.
// reader.add_video_stream(..., decoder="h264")
// reader.add_video_stream(..., decoder="x264")
// reader.add_video_stream(..., decoder="h264_cuvid")
}
}
stream->discard = AVDISCARD_DEFAULT;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment