Unverified Commit 06ad05fa authored by Joao Gomes's avatar Joao Gomes Committed by GitHub
Browse files

Read video from memory newapi (#6771)

* add tensor as optional param

* add init from memory

* fix bug

* fix bug

* first working version

* apply formatting and add tests

* simplify tests

* fix tests

* fix wrong variable name

* add path as optional parameter

* add src as optional

* address pr comments

* Fix warning messages

* address pr comments

* make tests stricter

* Revert "make tests stricter"

This reverts commit 6c92e94e8372f381c9496c9f885c2c71b6a4356b.
parent 246de077
...@@ -77,6 +77,7 @@ class TestVideoApi: ...@@ -77,6 +77,7 @@ class TestVideoApi:
# compare the frames and ptss # compare the frames and ptss
for i in range(len(vr_frames)): for i in range(len(vr_frames)):
assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
# on average the difference is very small and caused # on average the difference is very small and caused
# by decoding (around 1%) # by decoding (around 1%)
...@@ -114,6 +115,46 @@ class TestVideoApi: ...@@ -114,6 +115,46 @@ class TestVideoApi:
# we assure that there is never more than 1% difference in signal # we assure that there is never more than 1% difference in signal
assert max_delta.item() < 0.001 assert max_delta.item() < 0.001
@pytest.mark.parametrize("stream", ["video", "audio"])
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_frame_reading_mem_vs_file(self, test_video, stream):
    """Decoding from an in-memory buffer must match decoding from the file path."""
    full_path = os.path.join(VIDEO_DIR, test_video)

    # Decode straight from the file path.
    vr_frames, vr_pts = [], []
    video_reader = VideoReader(full_path, stream)
    for vr_frame in video_reader:
        vr_frames.append(vr_frame["data"])
        vr_pts.append(vr_frame["pts"])

    # Decode the same container from an in-memory bytes object.
    # Use a context manager so the handle is closed even if read() raises
    # (the original open()/read()/close() leaked the handle on error).
    with open(full_path, "rb") as f:
        fbytes = f.read()
    vr_frames_mem, vr_pts_mem = [], []
    video_reader_from_mem = VideoReader(fbytes, stream)
    for vr_frame_from_mem in video_reader_from_mem:
        vr_frames_mem.append(vr_frame_from_mem["data"])
        vr_pts_mem.append(vr_frame_from_mem["pts"])

    # Same number of frames and presentation timestamps.
    assert len(vr_frames) == len(vr_frames_mem)
    assert len(vr_pts) == len(vr_pts_mem)

    # Compare the frames and ptss.
    for i in range(len(vr_frames)):
        assert vr_pts[i] == vr_pts_mem[i]
        mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
        # on average the difference is very small and caused
        # by decoding (around 1%)
        # TODO: assess empirically how to set this? atm it's 1%
        # averaged over all frames
        assert mean_delta.item() < 2.55

    del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
@pytest.mark.parametrize("test_video,config", test_videos.items()) @pytest.mark.parametrize("test_video,config", test_videos.items())
def test_metadata(self, test_video, config): def test_metadata(self, test_video, config):
""" """
......
...@@ -165,7 +165,7 @@ struct MediaFormat { ...@@ -165,7 +165,7 @@ struct MediaFormat {
struct DecoderParameters { struct DecoderParameters {
// local file, remote file, http url, rtmp stream uri, etc. anything that // local file, remote file, http url, rtmp stream uri, etc. anything that
// ffmpeg can recognize // ffmpeg can recognize
std::string uri; std::string uri{std::string()};
// timeout on getting bytes for decoding // timeout on getting bytes for decoding
size_t timeoutMs{1000}; size_t timeoutMs{1000};
// logging level, default AV_LOG_PANIC // logging level, default AV_LOG_PANIC
......
...@@ -156,14 +156,34 @@ void Video::_getDecoderParams( ...@@ -156,14 +156,34 @@ void Video::_getDecoderParams(
} // _get decoder params } // _get decoder params
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { void Video::initFromFile(
C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); std::string videoPath,
std::string stream,
int64_t numThreads) {
TORCH_CHECK(!initialized, "Video object can only be initialized once");
initialized = true;
params.uri = videoPath;
_init(stream, numThreads);
}
// Initializes the decoder from an in-memory video container held in a
// uint8 tensor (the bytes/Tensor `src` path of the Python VideoReader).
//
// videoTensor: byte buffer with the encoded media. data_ptr<uint8_t>()
//     requires uint8 data, and size(0) is used as the buffer length, so
//     the tensor is expected to be one-dimensional — TODO confirm callers
//     always pass a flat buffer.
// stream:     stream specifier, e.g. "video" or "video:0".
// numThreads: decoder thread count forwarded to _init.
//
// NOTE(review): the callback captures a raw pointer into videoTensor's
// storage — presumably the caller must keep the tensor alive for the
// lifetime of this Video object; verify against callers.
void Video::initFromMemory(
    torch::Tensor videoTensor,
    std::string stream,
    int64_t numThreads) {
  // A Video object may be initialized exactly once (from file OR memory).
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  initialized = true;
  // Wrap the tensor's bytes in the read callback consumed by the decoder.
  callback = MemoryBuffer::getCallback(
      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
  _init(stream, numThreads);
}
void Video::_init(std::string stream, int64_t numThreads) {
// set number of threads global // set number of threads global
numThreads_ = numThreads; numThreads_ = numThreads;
// parse stream information // parse stream information
current_stream = _parseStream(stream); current_stream = _parseStream(stream);
// note that in the initial call we want to get all streams // note that in the initial call we want to get all streams
Video::_getDecoderParams( _getDecoderParams(
0, // video start 0, // video start
0, // headerOnly 0, // headerOnly
std::get<0>(current_stream), // stream info - remove that std::get<0>(current_stream), // stream info - remove that
...@@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { ...@@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
std::string logMessage, logType; std::string logMessage, logType;
// TODO: add read from memory option
params.uri = videoPath;
logType = "file";
logMessage = videoPath;
// locals // locals
std::vector<double> audioFPS, videoFPS; std::vector<double> audioFPS, videoFPS;
std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration; std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
...@@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { ...@@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
c10::Dict<std::string, std::vector<double>> subsMetadata; c10::Dict<std::string, std::vector<double>> subsMetadata;
// callback and metadata defined in struct // callback and metadata defined in struct
succeeded = decoder.init(params, std::move(callback), &metadata); DecoderInCallback tmp_callback = callback;
succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
if (succeeded) { if (succeeded) {
for (const auto& header : metadata) { for (const auto& header : metadata) {
double fps = double(header.fps); double fps = double(header.fps);
...@@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { ...@@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("subtitles", subsMetadata);
streamsMetadata.insert("cc", ccMetadata); streamsMetadata.insert("cc", ccMetadata);
succeeded = Video::setCurrentStream(stream); succeeded = setCurrentStream(stream);
LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
if (std::get<1>(current_stream) != -1) { if (std::get<1>(current_stream) != -1) {
LOG(INFO) LOG(INFO)
<< "Stream index set to " << std::get<1>(current_stream) << "Stream index set to " << std::get<1>(current_stream)
<< ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
} }
}
// Constructor kept for backward compatibility: a non-empty path behaves
// like the old file-based constructor; an empty path leaves the object
// uninitialized so the caller can invoke initFromFile/initFromMemory.
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
  if (videoPath.empty()) {
    return;
  }
  initFromFile(videoPath, stream, numThreads);
} // video
bool Video::setCurrentStream(std::string stream = "video") { bool Video::setCurrentStream(std::string stream = "video") {
TORCH_CHECK(initialized, "Video object has to be initialized first");
if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
current_stream = _parseStream(stream); current_stream = _parseStream(stream);
} }
...@@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ...@@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") {
); );
// callback and metadata defined in Video.h // callback and metadata defined in Video.h
return (decoder.init(params, std::move(callback), &metadata)); DecoderInCallback tmp_callback = callback;
return (decoder.init(params, std::move(tmp_callback), &metadata));
} }
std::tuple<std::string, int64_t> Video::getCurrentStream() const { std::tuple<std::string, int64_t> Video::getCurrentStream() const {
TORCH_CHECK(initialized, "Video object has to be initialized first");
return current_stream; return current_stream;
} }
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video:: c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
getStreamMetadata() const { getStreamMetadata() const {
TORCH_CHECK(initialized, "Video object has to be initialized first");
return streamsMetadata; return streamsMetadata;
} }
void Video::Seek(double ts, bool fastSeek = false) { void Video::Seek(double ts, bool fastSeek = false) {
TORCH_CHECK(initialized, "Video object has to be initialized first");
// initialize the class variables used for seeking and retrurn // initialize the class variables used for seeking and retrurn
_getDecoderParams( _getDecoderParams(
ts, // video start ts, // video start
...@@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) { ...@@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) {
); );
// callback and metadata defined in Video.h // callback and metadata defined in Video.h
succeeded = decoder.init(params, std::move(callback), &metadata); DecoderInCallback tmp_callback = callback;
succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
} }
std::tuple<torch::Tensor, double> Video::Next() { std::tuple<torch::Tensor, double> Video::Next() {
TORCH_CHECK(initialized, "Video object has to be initialized first");
// if failing to decode simply return a null tensor (note, should we // if failing to decode simply return a null tensor (note, should we
// raise an exeption?) // raise an exeption?)
double frame_pts_s; double frame_pts_s;
...@@ -345,6 +376,8 @@ std::tuple<torch::Tensor, double> Video::Next() { ...@@ -345,6 +376,8 @@ std::tuple<torch::Tensor, double> Video::Next() {
static auto registerVideo = static auto registerVideo =
torch::class_<Video>("torchvision", "Video") torch::class_<Video>("torchvision", "Video")
.def(torch::init<std::string, std::string, int64_t>()) .def(torch::init<std::string, std::string, int64_t>())
.def("init_from_file", &Video::initFromFile)
.def("init_from_memory", &Video::initFromMemory)
.def("get_current_stream", &Video::getCurrentStream) .def("get_current_stream", &Video::getCurrentStream)
.def("set_current_stream", &Video::setCurrentStream) .def("set_current_stream", &Video::setCurrentStream)
.def("get_metadata", &Video::getStreamMetadata) .def("get_metadata", &Video::getStreamMetadata)
......
...@@ -19,7 +19,19 @@ struct Video : torch::CustomClassHolder { ...@@ -19,7 +19,19 @@ struct Video : torch::CustomClassHolder {
int64_t numThreads_{0}; int64_t numThreads_{0};
public: public:
Video(std::string videoPath, std::string stream, int64_t numThreads); Video(
std::string videoPath = std::string(),
std::string stream = std::string("video"),
int64_t numThreads = 0);
void initFromFile(
std::string videoPath,
std::string stream,
int64_t numThreads);
void initFromMemory(
torch::Tensor videoTensor,
std::string stream,
int64_t numThreads);
std::tuple<std::string, int64_t> getCurrentStream() const; std::tuple<std::string, int64_t> getCurrentStream() const;
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
getStreamMetadata() const; getStreamMetadata() const;
...@@ -34,6 +46,12 @@ struct Video : torch::CustomClassHolder { ...@@ -34,6 +46,12 @@ struct Video : torch::CustomClassHolder {
// time in comination with any_frame settings // time in comination with any_frame settings
double seekTS = -1; double seekTS = -1;
bool initialized = false;
void _init(
std::string stream,
int64_t numThreads); // expects params.uri OR callback to be set
void _getDecoderParams( void _getDecoderParams(
double videoStartS, double videoStartS,
int64_t getPtsOnly, int64_t getPtsOnly,
......
from typing import Any, Dict, Iterator import warnings
from typing import Any, Dict, Iterator, Optional
import torch import torch
...@@ -71,8 +72,13 @@ class VideoReader: ...@@ -71,8 +72,13 @@ class VideoReader:
If only stream type is passed, the decoder auto-detects first stream of that type. If only stream type is passed, the decoder auto-detects first stream of that type.
Args: Args:
src (string, bytes object, or tensor): The media source.
If string-type, it must be a file path supported by FFMPEG.
If bytes, it should be an in-memory representation of a file supported by FFMPEG.             If bytes, it should be an in-memory representation of a file supported by FFMPEG.
If Tensor, it is interpreted internally as byte buffer.
It must be one-dimensional, of type ``torch.uint8``.
path (string): Path to the video file in supported format
stream (string, optional): descriptor of the required stream, followed by the stream id, stream (string, optional): descriptor of the required stream, followed by the stream id,
in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``. in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
...@@ -85,9 +91,23 @@ class VideoReader: ...@@ -85,9 +91,23 @@ class VideoReader:
device (str, optional): Device to be used for decoding. Defaults to ``"cpu"``. device (str, optional): Device to be used for decoding. Defaults to ``"cpu"``.
To use GPU decoding, pass ``device="cuda"``. To use GPU decoding, pass ``device="cuda"``.
path (str, optional):
.. warning::
This parameter was deprecated in ``0.15`` and will be removed in ``0.17``.
Please use ``src`` instead.
""" """
def __init__(self, path: str, stream: str = "video", num_threads: int = 0, device: str = "cpu") -> None: def __init__(
self,
src: str = "",
stream: str = "video",
num_threads: int = 0,
device: str = "cpu",
path: Optional[str] = None,
) -> None:
_log_api_usage_once(self) _log_api_usage_once(self)
self.is_cuda = False self.is_cuda = False
device = torch.device(device) device = torch.device(device)
...@@ -95,7 +115,7 @@ class VideoReader: ...@@ -95,7 +115,7 @@ class VideoReader:
if not _HAS_GPU_VIDEO_DECODER: if not _HAS_GPU_VIDEO_DECODER:
raise RuntimeError("Not compiled with GPU decoder support.") raise RuntimeError("Not compiled with GPU decoder support.")
self.is_cuda = True self.is_cuda = True
self._c = torch.classes.torchvision.GPUDecoder(path, device) self._c = torch.classes.torchvision.GPUDecoder(src, device)
return return
if not _has_video_opt(): if not _has_video_opt():
raise RuntimeError( raise RuntimeError(
...@@ -105,7 +125,24 @@ class VideoReader: ...@@ -105,7 +125,24 @@ class VideoReader:
+ "build torchvision from source." + "build torchvision from source."
) )
self._c = torch.classes.torchvision.Video(path, stream, num_threads) if src == "":
if path is None:
raise TypeError("src cannot be empty")
src = path
warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
elif isinstance(src, bytes):
src = torch.frombuffer(src, dtype=torch.uint8)
if isinstance(src, str):
self._c = torch.classes.torchvision.Video(src, stream, num_threads)
elif isinstance(src, torch.Tensor):
if self.is_cuda:
raise RuntimeError("GPU VideoReader cannot be initialized from Tensor or bytes object.")
self._c = torch.classes.torchvision.Video("", "", 0)
self._c.init_from_memory(src, stream, num_threads)
else:
raise TypeError("`src` must be either string, Tensor or bytes object.")
def __next__(self) -> Dict[str, Any]: def __next__(self) -> Dict[str, Any]:
"""Decodes and returns the next frame of the current stream. """Decodes and returns the next frame of the current stream.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment