"src/array/vscode:/vscode.git/clone" did not exist on "0235a31ac7cc56ea2f18737df9e47a9d2a712b19"
Unverified Commit 06ad05fa authored by Joao Gomes's avatar Joao Gomes Committed by GitHub
Browse files

Read video from memory newapi (#6771)

* add tensor as optional param

* add init from memory

* fix bug

* fix bug

* first working version

* apply formatting and add tests

* simplify tests

* fix tests

* fix wrong variable name

* add path as optional parameter

* add src as optional

* address pr comments

* Fix warning messages

* address pr comments

* make tests stricter

* Revert "make tests stricter"

This reverts commit 6c92e94e8372f381c9496c9f885c2c71b6a4356b.
parent 246de077
......@@ -77,6 +77,7 @@ class TestVideoApi:
# compare the frames and ptss
for i in range(len(vr_frames)):
assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
# on average the difference is very small and caused
# by decoding (around 1%)
......@@ -114,6 +115,46 @@ class TestVideoApi:
# we assure that there is never more than 1% difference in signal
assert max_delta.item() < 0.001
@pytest.mark.parametrize("stream", ["video", "audio"])
@pytest.mark.parametrize("test_video", test_videos.keys())
def test_frame_reading_mem_vs_file(self, test_video, stream):
    """Decoding a video from an in-memory byte buffer must match decoding the same video from its file path."""
    full_path = os.path.join(VIDEO_DIR, test_video)

    # Decode all frames by reading directly from the file path.
    vr_frames, vr_pts = [], []
    video_reader = VideoReader(full_path, stream)
    for vr_frame in video_reader:
        vr_frames.append(vr_frame["data"])
        vr_pts.append(vr_frame["pts"])

    # Decode all frames again, this time from an in-memory copy of the file.
    # Use a context manager so the file handle is closed even if read() raises
    # (the original open()/close() pair could leak the handle on error).
    with open(full_path, "rb") as f:
        fbytes = f.read()
    vr_frames_mem, vr_pts_mem = [], []
    video_reader_from_mem = VideoReader(fbytes, stream)
    for vr_frame_from_mem in video_reader_from_mem:
        vr_frames_mem.append(vr_frame_from_mem["data"])
        vr_pts_mem.append(vr_frame_from_mem["pts"])

    # Both decoders must produce the same number of frames...
    assert len(vr_frames) == len(vr_frames_mem)
    assert len(vr_pts) == len(vr_pts_mem)

    # ...with identical timestamps and (near-)identical pixel data.
    for i in range(len(vr_frames)):
        assert vr_pts[i] == vr_pts_mem[i]
        mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
        # On average the difference is very small and caused by decoding
        # (around 1%).
        # TODO: assess empirically how to set this? atm it's 1%
        # averaged over all frames
        assert mean_delta.item() < 2.55

    del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
@pytest.mark.parametrize("test_video,config", test_videos.items())
def test_metadata(self, test_video, config):
"""
......
......@@ -165,7 +165,7 @@ struct MediaFormat {
struct DecoderParameters {
// local file, remote file, http url, rtmp stream uri, etc. anything that
// ffmpeg can recognize
std::string uri;
std::string uri{std::string()};
// timeout on getting bytes for decoding
size_t timeoutMs{1000};
// logging level, default AV_LOG_PANIC
......
......@@ -156,14 +156,34 @@ void Video::_getDecoderParams(
} // _get decoder params
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
// Bind this Video object to a file on disk (or any URI FFMPEG can open).
// May be called at most once per object; a second init attempt throws.
// The decoder itself is opened lazily by _init().
void Video::initFromFile(
    std::string videoPath,
    std::string stream,
    int64_t numThreads) {
  // A Video object is single-use: it may be bound to exactly one source.
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  initialized = true;
  // Hand the path to the decoder parameters; for the file path no read
  // callback is installed — the decoder opens params.uri itself.
  // Sink parameters are moved to avoid needless string copies.
  params.uri = std::move(videoPath);
  _init(std::move(stream), numThreads);
}
// Bind this Video object to an encoded video held in memory as a uint8
// tensor. May be called at most once per object.
// NOTE(review): the callback captures a raw pointer into videoTensor's
// storage, so the caller must keep the tensor alive for the lifetime of
// this Video — confirm against the Python wrapper.
void Video::initFromMemory(
    torch::Tensor videoTensor,
    std::string stream,
    int64_t numThreads) {
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  // The callback reads size(0) bytes linearly from data_ptr(), so the
  // buffer must be a flat, contiguous byte array; a strided or offset view
  // would silently decode the wrong bytes.
  TORCH_CHECK(videoTensor.dim() == 1, "video tensor must be 1-dimensional");
  TORCH_CHECK(videoTensor.is_contiguous(), "video tensor must be contiguous");
  initialized = true;
  // Install a read callback over the tensor's bytes; the decoder pulls the
  // encoded stream from memory instead of opening params.uri.
  callback = MemoryBuffer::getCallback(
      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
  _init(std::move(stream), numThreads);
}
void Video::_init(std::string stream, int64_t numThreads) {
// set number of threads global
numThreads_ = numThreads;
// parse stream information
current_stream = _parseStream(stream);
// note that in the initial call we want to get all streams
Video::_getDecoderParams(
_getDecoderParams(
0, // video start
0, // headerOnly
std::get<0>(current_stream), // stream info - remove that
......@@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
std::string logMessage, logType;
// TODO: add read from memory option
params.uri = videoPath;
logType = "file";
logMessage = videoPath;
// locals
std::vector<double> audioFPS, videoFPS;
std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
......@@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
c10::Dict<std::string, std::vector<double>> subsMetadata;
// callback and metadata defined in struct
succeeded = decoder.init(params, std::move(callback), &metadata);
DecoderInCallback tmp_callback = callback;
succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
if (succeeded) {
for (const auto& header : metadata) {
double fps = double(header.fps);
......@@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
streamsMetadata.insert("subtitles", subsMetadata);
streamsMetadata.insert("cc", ccMetadata);
succeeded = Video::setCurrentStream(stream);
succeeded = setCurrentStream(stream);
LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
if (std::get<1>(current_stream) != -1) {
LOG(INFO)
<< "Stream index set to " << std::get<1>(current_stream)
<< ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
}
}
// Construct a Video. An empty videoPath means "construct uninitialized":
// the caller is then expected to follow up with initFromFile() or
// initFromMemory() (this is how the init_from_memory torch binding is used).
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
  if (!videoPath.empty()) {
    // Sink parameters are moved to avoid copying the strings again;
    // videoPath is only moved after the emptiness check above.
    initFromFile(std::move(videoPath), std::move(stream), numThreads);
  }
} // video
bool Video::setCurrentStream(std::string stream = "video") {
TORCH_CHECK(initialized, "Video object has to be initialized first");
if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
current_stream = _parseStream(stream);
}
......@@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") {
);
// callback and metadata defined in Video.h
return (decoder.init(params, std::move(callback), &metadata));
DecoderInCallback tmp_callback = callback;
return (decoder.init(params, std::move(tmp_callback), &metadata));
}
// Accessor for the currently selected stream. Valid only after
// initFromFile()/initFromMemory() has run; throws otherwise.
// Returns a (stream type, stream id) tuple — presumably as parsed from the
// "{stream_type}:{stream_id}" descriptor; verify against _parseStream.
std::tuple<std::string, int64_t> Video::getCurrentStream() const {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  return current_stream;
}
// Accessor for the per-stream metadata gathered at init time.
// Outer key is the stream type (the init path inserts "subtitles" and "cc",
// among others); inner dict maps metadata field name to a list of values.
// Throws if the object has not been initialized yet.
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
getStreamMetadata() const {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  return streamsMetadata;
}
void Video::Seek(double ts, bool fastSeek = false) {
TORCH_CHECK(initialized, "Video object has to be initialized first");
// initialize the class variables used for seeking and return
_getDecoderParams(
ts, // video start
......@@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) {
);
// callback and metadata defined in Video.h
succeeded = decoder.init(params, std::move(callback), &metadata);
DecoderInCallback tmp_callback = callback;
succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
}
std::tuple<torch::Tensor, double> Video::Next() {
TORCH_CHECK(initialized, "Video object has to be initialized first");
// if failing to decode simply return a null tensor (note, should we
// raise an exception?)
double frame_pts_s;
......@@ -345,6 +376,8 @@ std::tuple<torch::Tensor, double> Video::Next() {
static auto registerVideo =
torch::class_<Video>("torchvision", "Video")
.def(torch::init<std::string, std::string, int64_t>())
.def("init_from_file", &Video::initFromFile)
.def("init_from_memory", &Video::initFromMemory)
.def("get_current_stream", &Video::getCurrentStream)
.def("set_current_stream", &Video::setCurrentStream)
.def("get_metadata", &Video::getStreamMetadata)
......
......@@ -19,7 +19,19 @@ struct Video : torch::CustomClassHolder {
int64_t numThreads_{0};
public:
Video(std::string videoPath, std::string stream, int64_t numThreads);
Video(
std::string videoPath = std::string(),
std::string stream = std::string("video"),
int64_t numThreads = 0);
void initFromFile(
std::string videoPath,
std::string stream,
int64_t numThreads);
void initFromMemory(
torch::Tensor videoTensor,
std::string stream,
int64_t numThreads);
std::tuple<std::string, int64_t> getCurrentStream() const;
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>>
getStreamMetadata() const;
......@@ -34,6 +46,12 @@ struct Video : torch::CustomClassHolder {
// time in combination with any_frame settings
double seekTS = -1;
bool initialized = false;
void _init(
std::string stream,
int64_t numThreads); // expects params.uri OR callback to be set
void _getDecoderParams(
double videoStartS,
int64_t getPtsOnly,
......
from typing import Any, Dict, Iterator
import warnings
from typing import Any, Dict, Iterator, Optional
import torch
......@@ -71,8 +72,13 @@ class VideoReader:
If only stream type is passed, the decoder auto-detects first stream of that type.
Args:
src (string, bytes object, or tensor): The media source.
If string-type, it must be a file path supported by FFMPEG.
If bytes, it should be an in-memory representation of a file supported by FFMPEG.
If Tensor, it is interpreted internally as byte buffer.
It must be one-dimensional, of type ``torch.uint8``.
path (string): Path to the video file in supported format
stream (string, optional): descriptor of the required stream, followed by the stream id,
in the format ``{stream_type}:{stream_id}``. Defaults to ``"video:0"``.
......@@ -85,9 +91,23 @@ class VideoReader:
device (str, optional): Device to be used for decoding. Defaults to ``"cpu"``.
To use GPU decoding, pass ``device="cuda"``.
path (str, optional):
.. warning:
This parameter was deprecated in ``0.15`` and will be removed in ``0.17``.
Please use ``src`` instead.
"""
def __init__(self, path: str, stream: str = "video", num_threads: int = 0, device: str = "cpu") -> None:
def __init__(
self,
src: str = "",
stream: str = "video",
num_threads: int = 0,
device: str = "cpu",
path: Optional[str] = None,
) -> None:
_log_api_usage_once(self)
self.is_cuda = False
device = torch.device(device)
......@@ -95,7 +115,7 @@ class VideoReader:
if not _HAS_GPU_VIDEO_DECODER:
raise RuntimeError("Not compiled with GPU decoder support.")
self.is_cuda = True
self._c = torch.classes.torchvision.GPUDecoder(path, device)
self._c = torch.classes.torchvision.GPUDecoder(src, device)
return
if not _has_video_opt():
raise RuntimeError(
......@@ -105,7 +125,24 @@ class VideoReader:
+ "build torchvision from source."
)
self._c = torch.classes.torchvision.Video(path, stream, num_threads)
if src == "":
if path is None:
raise TypeError("src cannot be empty")
src = path
warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
elif isinstance(src, bytes):
src = torch.frombuffer(src, dtype=torch.uint8)
if isinstance(src, str):
self._c = torch.classes.torchvision.Video(src, stream, num_threads)
elif isinstance(src, torch.Tensor):
if self.is_cuda:
raise RuntimeError("GPU VideoReader cannot be initialized from Tensor or bytes object.")
self._c = torch.classes.torchvision.Video("", "", 0)
self._c.init_from_memory(src, stream, num_threads)
else:
raise TypeError("`src` must be either string, Tensor or bytes object.")
def __next__(self) -> Dict[str, Any]:
"""Decodes and returns the next frame of the current stream.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment