Unverified commit 2e833520, authored by Bruno Korbar, committed by GitHub
Browse files

Pyav backend for VideoReader API (#6598)



* Test: add backend parameter

* VideoReader object now works on backend

* Frame reading now passes

* Keyframe seek now passes

* Pyav backend now supports metadata

* changes in test to reflect GPU decoder change

* Linter?

* Test GPU output

* Addressing Joao's comments

* lint

* lint

* Revert "Test GPU output"

This reverts commit f62e955d7dc81bcb23b40d58ea75413b9b62e76d.

* lint?

* lint

* lint

* Address issues in build?

* hopefully doc fix

* Arrgh

* arrgh

* fix typos

* fix input options

* remove read from memory option in pyav

* skip read from mem test for gpu and pyav backends

* fix test

* remove unused import

* Hack to get reading from memory work with pyav

* patch audio test
Co-authored-by: Bruno Korbar <bkorbar@quansight.com>
Co-authored-by: Joao Gomes <jdsgomes@fb.com>
parent 70faba91
...@@ -3,7 +3,9 @@ import os ...@@ -3,7 +3,9 @@ import os
import pytest import pytest
import torch import torch
from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader import torchvision
from torchvision import _HAS_GPU_VIDEO_DECODER
from torchvision.io import VideoReader
try: try:
import av import av
...@@ -29,8 +31,9 @@ class TestVideoGPUDecoder: ...@@ -29,8 +31,9 @@ class TestVideoGPUDecoder:
], ],
) )
def test_frame_reading(self, video_file): def test_frame_reading(self, video_file):
torchvision.set_video_backend("cuda")
full_path = os.path.join(VIDEO_DIR, video_file) full_path = os.path.join(VIDEO_DIR, video_file)
decoder = VideoReader(full_path, device="cuda") decoder = VideoReader(full_path)
with av.open(full_path) as container: with av.open(full_path) as container:
for av_frame in container.decode(container.streams.video[0]): for av_frame in container.decode(container.streams.video[0]):
av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
...@@ -54,7 +57,8 @@ class TestVideoGPUDecoder: ...@@ -54,7 +57,8 @@ class TestVideoGPUDecoder:
], ],
) )
def test_seek_reading(self, keyframes, full_path, duration): def test_seek_reading(self, keyframes, full_path, duration):
decoder = VideoReader(full_path, device="cuda") torchvision.set_video_backend("cuda")
decoder = VideoReader(full_path)
time = duration / 2 time = duration / 2
decoder.seek(time, keyframes_only=keyframes) decoder.seek(time, keyframes_only=keyframes)
with av.open(full_path) as container: with av.open(full_path) as container:
...@@ -79,8 +83,9 @@ class TestVideoGPUDecoder: ...@@ -79,8 +83,9 @@ class TestVideoGPUDecoder:
], ],
) )
def test_metadata(self, video_file): def test_metadata(self, video_file):
torchvision.set_video_backend("cuda")
full_path = os.path.join(VIDEO_DIR, video_file) full_path = os.path.join(VIDEO_DIR, video_file)
decoder = VideoReader(full_path, device="cuda") decoder = VideoReader(full_path)
video_metadata = decoder.get_metadata()["video"] video_metadata = decoder.get_metadata()["video"]
with av.open(full_path) as container: with av.open(full_path) as container:
video = container.streams.video[0] video = container.streams.video[0]
......
...@@ -53,7 +53,9 @@ test_videos = { ...@@ -53,7 +53,9 @@ test_videos = {
class TestVideoApi: class TestVideoApi:
@pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.skipif(av is None, reason="PyAV unavailable")
@pytest.mark.parametrize("test_video", test_videos.keys()) @pytest.mark.parametrize("test_video", test_videos.keys())
def test_frame_reading(self, test_video): @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
def test_frame_reading(self, test_video, backend):
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
with av.open(full_path) as av_reader: with av.open(full_path) as av_reader:
if av_reader.streams.video: if av_reader.streams.video:
...@@ -117,50 +119,60 @@ class TestVideoApi: ...@@ -117,50 +119,60 @@ class TestVideoApi:
@pytest.mark.parametrize("stream", ["video", "audio"]) @pytest.mark.parametrize("stream", ["video", "audio"])
@pytest.mark.parametrize("test_video", test_videos.keys()) @pytest.mark.parametrize("test_video", test_videos.keys())
def test_frame_reading_mem_vs_file(self, test_video, stream): @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
# Test video reading from file vs from memory reader = VideoReader(full_path)
vr_frames, vr_frames_mem = [], [] reader_md = reader.get_metadata()
vr_pts, vr_pts_mem = [], []
# get vr frames if stream in reader_md:
video_reader = VideoReader(full_path, stream) # Test video reading from file vs from memory
for vr_frame in video_reader: vr_frames, vr_frames_mem = [], []
vr_frames.append(vr_frame["data"]) vr_pts, vr_pts_mem = [], []
vr_pts.append(vr_frame["pts"]) # get vr frames
video_reader = VideoReader(full_path, stream)
# get vr frames = read from memory for vr_frame in video_reader:
f = open(full_path, "rb") vr_frames.append(vr_frame["data"])
fbytes = f.read() vr_pts.append(vr_frame["pts"])
f.close()
video_reader_from_mem = VideoReader(fbytes, stream) # get vr frames = read from memory
f = open(full_path, "rb")
for vr_frame_from_mem in video_reader_from_mem: fbytes = f.read()
vr_frames_mem.append(vr_frame_from_mem["data"]) f.close()
vr_pts_mem.append(vr_frame_from_mem["pts"]) video_reader_from_mem = VideoReader(fbytes, stream)
# same number of frames for vr_frame_from_mem in video_reader_from_mem:
assert len(vr_frames) == len(vr_frames_mem) vr_frames_mem.append(vr_frame_from_mem["data"])
assert len(vr_pts) == len(vr_pts_mem) vr_pts_mem.append(vr_frame_from_mem["pts"])
# compare the frames and ptss # same number of frames
for i in range(len(vr_frames)): assert len(vr_frames) == len(vr_frames_mem)
assert vr_pts[i] == vr_pts_mem[i] assert len(vr_pts) == len(vr_pts_mem)
mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
# on average the difference is very small and caused # compare the frames and ptss
# by decoding (around 1%) for i in range(len(vr_frames)):
# TODO: asses empirically how to set this? atm it's 1% assert vr_pts[i] == vr_pts_mem[i]
# averaged over all frames mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
assert mean_delta.item() < 2.55 # on average the difference is very small and caused
# by decoding (around 1%)
del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem # TODO: asses empirically how to set this? atm it's 1%
# averaged over all frames
assert mean_delta.item() < 2.55
del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
else:
del reader, reader_md
@pytest.mark.parametrize("test_video,config", test_videos.items()) @pytest.mark.parametrize("test_video,config", test_videos.items())
def test_metadata(self, test_video, config): @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
def test_metadata(self, test_video, config, backend):
""" """
Test that the metadata returned via pyav corresponds to the one returned Test that the metadata returned via pyav corresponds to the one returned
by the new video decoder API by the new video decoder API
""" """
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
reader = VideoReader(full_path, "video") reader = VideoReader(full_path, "video")
reader_md = reader.get_metadata() reader_md = reader.get_metadata()
...@@ -168,7 +180,9 @@ class TestVideoApi: ...@@ -168,7 +180,9 @@ class TestVideoApi:
assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
@pytest.mark.parametrize("test_video", test_videos.keys()) @pytest.mark.parametrize("test_video", test_videos.keys())
def test_seek_start(self, test_video): @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
def test_seek_start(self, test_video, backend):
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
video_reader = VideoReader(full_path, "video") video_reader = VideoReader(full_path, "video")
num_frames = 0 num_frames = 0
...@@ -194,7 +208,9 @@ class TestVideoApi: ...@@ -194,7 +208,9 @@ class TestVideoApi:
assert start_num_frames == num_frames assert start_num_frames == num_frames
@pytest.mark.parametrize("test_video", test_videos.keys()) @pytest.mark.parametrize("test_video", test_videos.keys())
def test_accurateseek_middle(self, test_video): @pytest.mark.parametrize("backend", ["video_reader"])
def test_accurateseek_middle(self, test_video, backend):
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
stream = "video" stream = "video"
video_reader = VideoReader(full_path, stream) video_reader = VideoReader(full_path, stream)
...@@ -233,7 +249,9 @@ class TestVideoApi: ...@@ -233,7 +249,9 @@ class TestVideoApi:
@pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.skipif(av is None, reason="PyAV unavailable")
@pytest.mark.parametrize("test_video,config", test_videos.items()) @pytest.mark.parametrize("test_video,config", test_videos.items())
def test_keyframe_reading(self, test_video, config): @pytest.mark.parametrize("backend", ["pyav", "video_reader"])
def test_keyframe_reading(self, test_video, config, backend):
torchvision.set_video_backend(backend)
full_path = os.path.join(VIDEO_DIR, test_video) full_path = os.path.join(VIDEO_DIR, test_video)
av_reader = av.open(full_path) av_reader = av.open(full_path)
......
import os import os
import warnings import warnings
from modulefinder import Module
import torch import torch
from torchvision import datasets, io, models, ops, transforms, utils from torchvision import datasets, io, models, ops, transforms, utils
from .extension import _HAS_OPS from .extension import _HAS_OPS, _load_library
try: try:
from .version import __version__ # noqa: F401 from .version import __version__ # noqa: F401
except ImportError: except ImportError:
pass pass
try:
_load_library("Decoder")
_HAS_GPU_VIDEO_DECODER = True
except (ImportError, OSError, ModuleNotFoundError):
_HAS_GPU_VIDEO_DECODER = False
# Check if torchvision is being imported within the root folder # Check if torchvision is being imported within the root folder
if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join( if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join(
os.path.realpath(os.getcwd()), "torchvision" os.path.realpath(os.getcwd()), "torchvision"
...@@ -66,11 +74,16 @@ def set_video_backend(backend): ...@@ -66,11 +74,16 @@ def set_video_backend(backend):
backend, please compile torchvision from source. backend, please compile torchvision from source.
""" """
global _video_backend global _video_backend
if backend not in ["pyav", "video_reader"]: if backend not in ["pyav", "video_reader", "cuda"]:
raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend)
if backend == "video_reader" and not io._HAS_VIDEO_OPT: if backend == "video_reader" and not io._HAS_VIDEO_OPT:
# TODO: better messages
message = "video_reader video backend is not available. Please compile torchvision from source and try again" message = "video_reader video backend is not available. Please compile torchvision from source and try again"
warnings.warn(message) raise RuntimeError(message)
elif backend == "cuda" and not _HAS_GPU_VIDEO_DECODER:
# TODO: better messages
message = "cuda video backend is not available."
raise RuntimeError(message)
else: else:
_video_backend = backend _video_backend = backend
......
...@@ -4,10 +4,6 @@ import torch ...@@ -4,10 +4,6 @@ import torch
from ..utils import _log_api_usage_once from ..utils import _log_api_usage_once
try:
from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
except ModuleNotFoundError:
_HAS_GPU_VIDEO_DECODER = False
from ._video_opt import ( from ._video_opt import (
_HAS_VIDEO_OPT, _HAS_VIDEO_OPT,
_probe_video_from_file, _probe_video_from_file,
...@@ -47,7 +43,6 @@ __all__ = [ ...@@ -47,7 +43,6 @@ __all__ = [
"_read_video_timestamps_from_memory", "_read_video_timestamps_from_memory",
"_probe_video_from_memory", "_probe_video_from_memory",
"_HAS_VIDEO_OPT", "_HAS_VIDEO_OPT",
"_HAS_GPU_VIDEO_DECODER",
"_read_video_clip_from_memory", "_read_video_clip_from_memory",
"_read_video_meta_data", "_read_video_meta_data",
"VideoMetaData", "VideoMetaData",
......
from ..extension import _load_library
try:
_load_library("Decoder")
_HAS_GPU_VIDEO_DECODER = True
except (ImportError, OSError):
_HAS_GPU_VIDEO_DECODER = False
import io
import warnings import warnings
from typing import Any, Dict, Iterator, Optional from typing import Any, Dict, Iterator, Optional
import torch import torch
from ..utils import _log_api_usage_once from ..utils import _log_api_usage_once
try:
from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
except ModuleNotFoundError:
_HAS_GPU_VIDEO_DECODER = False
from ._video_opt import _HAS_VIDEO_OPT from ._video_opt import _HAS_VIDEO_OPT
if _HAS_VIDEO_OPT: if _HAS_VIDEO_OPT:
...@@ -22,11 +20,37 @@ else: ...@@ -22,11 +20,37 @@ else:
return False return False
try:
import av
av.logging.set_level(av.logging.ERROR)
if not hasattr(av.video.frame.VideoFrame, "pict_type"):
av = ImportError(
"""\
Your version of PyAV is too old for the necessary video operations in torchvision.
If you are on Python 3.5, you will have to build from source (the conda-forge
packages are not up-to-date). See
https://github.com/mikeboers/PyAV#installation for instructions on how to
install PyAV on your system.
"""
)
except ImportError:
av = ImportError(
"""\
PyAV is not installed, and is necessary for the video operations in torchvision.
See https://github.com/mikeboers/PyAV#installation for instructions on how to
install PyAV on your system.
"""
)
class VideoReader: class VideoReader:
""" """
Fine-grained video-reading API. Fine-grained video-reading API.
Supports frame-by-frame reading of various streams from a single video Supports frame-by-frame reading of various streams from a single video
container. container. Much like previous video_reader API it supports the following
backends: video_reader, pyav, and cuda.
Backends can be set via `torchvision.set_video_backend` function.
.. betastatus:: VideoReader class .. betastatus:: VideoReader class
...@@ -88,16 +112,11 @@ class VideoReader: ...@@ -88,16 +112,11 @@ class VideoReader:
Default value (0) enables multithreading with codec-dependent heuristic. The performance Default value (0) enables multithreading with codec-dependent heuristic. The performance
will depend on the version of FFMPEG codecs supported. will depend on the version of FFMPEG codecs supported.
device (str, optional): Device to be used for decoding. Defaults to ``"cpu"``.
To use GPU decoding, pass ``device="cuda"``.
path (str, optional): path (str, optional):
.. warning: .. warning:
This parameter was deprecated in ``0.15`` and will be removed in ``0.17``. This parameter was deprecated in ``0.15`` and will be removed in ``0.17``.
Please use ``src`` instead. Please use ``src`` instead.
""" """
def __init__( def __init__(
...@@ -105,45 +124,59 @@ class VideoReader: ...@@ -105,45 +124,59 @@ class VideoReader:
src: str = "", src: str = "",
stream: str = "video", stream: str = "video",
num_threads: int = 0, num_threads: int = 0,
device: str = "cpu",
path: Optional[str] = None, path: Optional[str] = None,
) -> None: ) -> None:
_log_api_usage_once(self) _log_api_usage_once(self)
self.is_cuda = False from .. import get_video_backend
device = torch.device(device)
if device.type == "cuda":
if not _HAS_GPU_VIDEO_DECODER:
raise RuntimeError("Not compiled with GPU decoder support.")
self.is_cuda = True
self._c = torch.classes.torchvision.GPUDecoder(src, device)
return
if not _has_video_opt():
raise RuntimeError(
"Not compiled with video_reader support, "
+ "to enable video_reader support, please install "
+ "ffmpeg (version 4.2 is currently supported) and "
+ "build torchvision from source."
)
if src == "":
if path is None:
raise TypeError("src cannot be empty")
src = path
warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
elif isinstance(src, bytes):
src = torch.frombuffer(src, dtype=torch.uint8)
self.backend = get_video_backend()
if isinstance(src, str): if isinstance(src, str):
self._c = torch.classes.torchvision.Video(src, stream, num_threads) if src == "":
if path is None:
raise TypeError("src cannot be empty")
src = path
warnings.warn("path is deprecated and will be removed in 0.17. Please use src instead")
elif isinstance(src, bytes):
if self.backend in ["cuda"]:
raise RuntimeError(
"VideoReader cannot be initialized from bytes object when using cuda or pyav backend."
)
elif self.backend == "pyav":
src = io.BytesIO(src)
else:
src = torch.frombuffer(src, dtype=torch.uint8)
elif isinstance(src, torch.Tensor): elif isinstance(src, torch.Tensor):
if self.is_cuda: if self.backend in ["cuda", "pyav"]:
raise RuntimeError("GPU VideoReader cannot be initialized from Tensor or bytes object.") raise RuntimeError(
self._c = torch.classes.torchvision.Video("", "", 0) "VideoReader cannot be initialized from Tensor object when using cuda or pyav backend."
self._c.init_from_memory(src, stream, num_threads) )
else: else:
raise TypeError("`src` must be either string, Tensor or bytes object.") raise TypeError("`src` must be either string, Tensor or bytes object.")
if self.backend == "cuda":
device = torch.device("cuda")
self._c = torch.classes.torchvision.GPUDecoder(src, device)
elif self.backend == "video_reader":
if isinstance(src, str):
self._c = torch.classes.torchvision.Video(src, stream, num_threads)
elif isinstance(src, torch.Tensor):
self._c = torch.classes.torchvision.Video("", "", 0)
self._c.init_from_memory(src, stream, num_threads)
elif self.backend == "pyav":
self.container = av.open(src, metadata_errors="ignore")
# TODO: load metadata
stream_type = stream.split(":")[0]
stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
self.pyav_stream = {stream_type: stream_id}
self._c = self.container.decode(**self.pyav_stream)
# TODO: add extradata exception
else:
raise RuntimeError("Unknown video backend: {}".format(self.backend))
def __next__(self) -> Dict[str, Any]: def __next__(self) -> Dict[str, Any]:
"""Decodes and returns the next frame of the current stream. """Decodes and returns the next frame of the current stream.
Frames are encoded as a dict with mandatory Frames are encoded as a dict with mandatory
...@@ -156,14 +189,29 @@ class VideoReader: ...@@ -156,14 +189,29 @@ class VideoReader:
and corresponding timestamp (``pts``) in seconds and corresponding timestamp (``pts``) in seconds
""" """
if self.is_cuda: if self.backend == "cuda":
frame = self._c.next() frame = self._c.next()
if frame.numel() == 0: if frame.numel() == 0:
raise StopIteration raise StopIteration
return {"data": frame} return {"data": frame, "pts": None}
frame, pts = self._c.next() elif self.backend == "video_reader":
frame, pts = self._c.next()
else:
try:
frame = next(self._c)
pts = float(frame.pts * frame.time_base)
if "video" in self.pyav_stream:
frame = torch.tensor(frame.to_rgb().to_ndarray()).permute(2, 0, 1)
elif "audio" in self.pyav_stream:
frame = torch.tensor(frame.to_ndarray()).permute(1, 0)
else:
frame = None
except av.error.EOFError:
raise StopIteration
if frame.numel() == 0: if frame.numel() == 0:
raise StopIteration raise StopIteration
return {"data": frame, "pts": pts} return {"data": frame, "pts": pts}
def __iter__(self) -> Iterator[Dict[str, Any]]: def __iter__(self) -> Iterator[Dict[str, Any]]:
...@@ -182,7 +230,18 @@ class VideoReader: ...@@ -182,7 +230,18 @@ class VideoReader:
frame with the exact timestamp if it exists or frame with the exact timestamp if it exists or
the first frame with timestamp larger than ``time_s``. the first frame with timestamp larger than ``time_s``.
""" """
self._c.seek(time_s, keyframes_only) if self.backend in ["cuda", "video_reader"]:
self._c.seek(time_s, keyframes_only)
else:
# handle special case as pyav doesn't catch it
if time_s < 0:
time_s = 0
temp_str = self.container.streams.get(**self.pyav_stream)[0]
offset = int(round(time_s / temp_str.time_base))
if not keyframes_only:
warnings.warn("Accurate seek is not implemented for pyav backend")
self.container.seek(offset, backward=True, any_frame=False, stream=temp_str)
self._c = self.container.decode(**self.pyav_stream)
return self return self
def get_metadata(self) -> Dict[str, Any]: def get_metadata(self) -> Dict[str, Any]:
...@@ -191,6 +250,21 @@ class VideoReader: ...@@ -191,6 +250,21 @@ class VideoReader:
Returns: Returns:
(dict): dictionary containing duration and frame rate for every stream (dict): dictionary containing duration and frame rate for every stream
""" """
if self.backend == "pyav":
metadata = {} # type: Dict[str, Any]
for stream in self.container.streams:
if stream.type not in metadata:
if stream.type == "video":
rate_n = "fps"
else:
rate_n = "framerate"
metadata[stream.type] = {rate_n: [], "duration": []}
rate = stream.average_rate if stream.average_rate is not None else stream.sample_rate
metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
metadata[stream.type][rate_n].append(float(rate))
return metadata
return self._c.get_metadata() return self._c.get_metadata()
def set_current_stream(self, stream: str) -> bool: def set_current_stream(self, stream: str) -> bool:
...@@ -210,6 +284,12 @@ class VideoReader: ...@@ -210,6 +284,12 @@ class VideoReader:
Returns: Returns:
(bool): True on succes, False otherwise (bool): True on succes, False otherwise
""" """
if self.is_cuda: if self.backend == "cuda":
print("GPU decoding only works with video stream.") warnings.warn("GPU decoding only works with video stream.")
if self.backend == "pyav":
stream_type = stream.split(":")[0]
stream_id = 0 if len(stream.split(":")) == 1 else int(stream.split(":")[1])
self.pyav_stream = {stream_type: stream_id}
self._c = self.container.decode(**self.pyav_stream)
return True
return self._c.set_current_stream(stream) return self._c.set_current_stream(stream)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment