Unverified Commit 8ea04d13 authored by Bruno Korbar's avatar Bruno Korbar Committed by GitHub
Browse files

Download and Kinetics 400/600/700 Datasets (#3680)



* Initial commit

* pmeiers comments
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers changes
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers comments
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* replace pandas with system library to avoid crashes

* Lint

* Lint

* fixing unittest

* Minor comments removal

* pmeier comments
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* remove asserts

* address pmeier formatting changes

* address pmeier changes
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeier changes
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* rename n_classes to num_classes

* formatting changes

* doc change to add ".mp4" to backported class

* formatting to correct line length

* adding **kwargs to Kinetics400 class

* remove urlib request and download the file directly

* annotations and files can be already downloaded

* test fix

* add download tests for Kinetics

* users now don't need to provide the full path within the root for the new Kinetics dataset

* linter

* Update test/test_datasets_download.py

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* revert whitespace (3680#discussion_r626382842)

* addressing annotation_path parameter which is unnecessary

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* kwargs update
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* expose num_download_workers as public

* swap os.isfile with check_integrity

* nit on private things

* special case if there are no default arguments

* revert changes to kinetics400 test case for BC

* add split_folder changes and support for legacy format

* pmeiers suggestions
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers suggestions - root comment

* pmeiers comments - annotation attribute removed

* pmeiers suggestion
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers suggestion
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers suggestion
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* pmeiers suggestion
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>

* minor debugging

* nit picks

* only include public kwargs into defaults

* add _use_legacy_structure in favour of **kwargs

* add type hints for Kinetics400

* flake8

* flake8

* flake8

* rename to make things clearer

* permuting the output
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>
Co-authored-by: default avatarFrancisco Massa <fvsmassa@gmail.com>
parent 9a6c8bbe
......@@ -416,7 +416,11 @@ class DatasetTestCase(unittest.TestCase):
continue
defaults.append(
{kwarg: default for kwarg, default in zip(argspec.args[-len(argspec.defaults):], argspec.defaults)}
{
kwarg: default
for kwarg, default in zip(argspec.args[-len(argspec.defaults):], argspec.defaults)
if not kwarg.startswith("_")
}
)
if not argspec.varkw:
......@@ -637,7 +641,7 @@ class VideoDatasetTestCase(DatasetTestCase):
def _set_default_frames_per_clip(self, inject_fake_data):
argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__)
args_without_default = argspec.args[1:-len(argspec.defaults)]
args_without_default = argspec.args[1:(-len(argspec.defaults) if argspec.defaults else None)]
frames_per_clip_last = args_without_default[-1] == "frames_per_clip"
@functools.wraps(inject_fake_data)
......
......@@ -944,6 +944,27 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase):
super().test_not_found_or_corrupted()
class KineticsTestCase(datasets_utils.VideoDatasetTestCase):
    """Test case for the generic ``datasets.Kinetics`` class, exercised over
    both splits and all three dataset sizes (400/600/700)."""

    DATASET_CLASS = datasets.Kinetics
    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(
        split=("train", "val"), num_classes=("400", "600", "700")
    )

    def inject_fake_data(self, tmpdir, config):
        # Fabricate a minimal on-disk layout: root/<split>/<class>/<clip>.mp4
        categories = ("Abseiling", "Zumba")
        clips_per_category = 2
        split_dir = pathlib.Path(tmpdir) / config["split"]
        # Alphabet mimicking YouTube video ids (11 chars, letters/digits/-/_).
        id_alphabet = string.ascii_letters + string.digits + "-_"

        for category in categories:
            datasets_utils.create_video_folder(
                split_dir,
                category,
                lambda _: f"{datasets_utils.create_random_string(11, id_alphabet)}.mp4",
                clips_per_category,
            )

        return len(categories) * clips_per_category
class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase):
DATASET_CLASS = datasets.Kinetics400
......
......@@ -392,6 +392,25 @@ def widerface():
)
def kinetics():
    """Collect download-test configurations for Kinetics-400/600/700, train and val splits."""
    return itertools.chain(
        *[
            collect_download_configs(
                # NOTE: num_classes/split are bound as lambda defaults. A plain
                # closure over the comprehension variables would be late-binding:
                # if collect_download_configs defers invoking the factory, every
                # config would see the final ("700", "val") pair.
                lambda num_classes=num_classes, split=split: datasets.Kinetics(
                    path.join(ROOT, f"Kinetics{num_classes}"),
                    frames_per_clip=1,
                    num_classes=num_classes,
                    split=split,
                    download=True,
                ),
                name=f"Kinetics, {num_classes}, {split}",
                file="kinetics",
            )
            for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val"))
        ]
    )
def kitti():
return itertools.chain(
*[
......@@ -440,6 +459,7 @@ def make_parametrize_kwargs(download_configs):
usps(),
celeba(),
widerface(),
kinetics(),
kitti(),
)
)
......
......@@ -20,7 +20,7 @@ from .widerface import WIDERFace
from .sbd import SBDataset
from .vision import VisionDataset
from .usps import USPS
from .kinetics import Kinetics400
from .kinetics import Kinetics400, Kinetics
from .hmdb51 import HMDB51
from .ucf101 import UCF101
from .places365 import Places365
......@@ -34,6 +34,6 @@ __all__ = ('LSUN', 'LSUNClass',
'Omniglot', 'SBU', 'Flickr8k', 'Flickr30k',
'VOCSegmentation', 'VOCDetection', 'Cityscapes', 'ImageNet',
'Caltech101', 'Caltech256', 'CelebA', 'WIDERFace', 'SBDataset',
'VisionDataset', 'USPS', 'Kinetics400', 'HMDB51', 'UCF101',
'VisionDataset', 'USPS', 'Kinetics400', "Kinetics", 'HMDB51', 'UCF101',
'Places365', 'Kitti',
)
from .utils import list_dir
import time
import os
import warnings
from os import path
import csv
from typing import Any, Callable, Dict, Optional, Tuple
from functools import partial
from multiprocessing import Pool
from .utils import download_and_extract_archive, download_url, verify_str_arg, check_integrity
from .folder import find_classes, make_dataset
from .video_utils import VideoClips
from .vision import VisionDataset
class Kinetics400(VisionDataset):
"""
`Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
    # Module-level helper so it is picklable for multiprocessing.Pool.map
    # (used via functools.partial in Kinetics._download_videos): downloads the
    # archive at URL `line` into `tarpath` and extracts it into `videopath`.
    download_and_extract_archive(line, tarpath, videopath)
class Kinetics(VisionDataset):
"""` Generic Kinetics <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
dataset.
Kinetics-400 is an action recognition video dataset.
Kinetics-400/600/700 are action recognition video datasets.
This dataset consider every video as a collection of video clips of fixed size, specified
by ``frames_per_clip``, where the step in frames between each clip is given by
``step_between_clips``.
......@@ -20,44 +34,101 @@ class Kinetics400(VisionDataset):
Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
frames in a video might be present.
Internally, it uses a VideoClips object to handle clip creation.
Args:
root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows:
root (string): Root directory of the Kinetics Dataset.
Directory should be structured as follows:
.. code::
root/
├── class1
│ ├── clip1.avi
│ ├── clip2.avi
│ └── ...
└── class2
├── clipx.avi
└── ...
├── split
│ ├── class1
│ │ ├── clip1.mp4
│ │ ├── clip2.mp4
│ │ ├── clip3.mp4
│ │ ├── ...
│ ├── class2
│ │ ├── clipx.mp4
│ │ └── ...
Note: split is appended automatically using the split argument.
frames_per_clip (int): number of frames in a clip
num_classes (int): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700
split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"``
frame_rate (float): If omitted, interpolate different frame rate for each clip.
step_between_clips (int): number of frames between each clip
transform (callable, optional): A function/transform that takes in a TxHxWxC video
and returns a transformed version.
download (bool): Download the official version of the dataset to root folder.
num_workers (int): Use multiple workers for VideoClips creation
num_download_workers (int): Use multiprocessing in order to speed up download.
Returns:
tuple: A 3-tuple with the following entries:
- video (Tensor[T, H, W, C]): the `T` video frames
- video (Tensor[T, C, H, W]): the `T` video frames in torch.uint8 tensor
- audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
and `L` is the number of points
and `L` is the number of points in torch.float tensor
- label (int): class of the video clip
Raises:
RuntimeError: If ``download is True`` and the video archives are already extracted.
"""
def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None,
extensions=('avi',), transform=None, _precomputed_metadata=None,
num_workers=1, _video_width=0, _video_height=0,
_video_min_dimension=0, _audio_samples=0, _audio_channels=0):
super(Kinetics400, self).__init__(root)
_TAR_URLS = {
"400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
"600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
"700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
}
_ANNOTATION_URLS = {
"400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
"600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.txt",
"700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
}
def __init__(
self,
root: str,
frames_per_clip: int,
num_classes: str = "400",
split: str = "train",
frame_rate: Optional[float] = None,
step_between_clips: int = 1,
transform: Optional[Callable] = None,
extensions: Tuple[str, ...] = ("avi", "mp4"),
download: bool = False,
num_download_workers: int = 1,
num_workers: int = 1,
_precomputed_metadata: Optional[Dict] = None,
_video_width: int = 0,
_video_height: int = 0,
_video_min_dimension: int = 0,
_audio_samples: int = 0,
_audio_channels: int = 0,
_legacy: bool = False,
) -> None:
# TODO: support test
self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
self.extensions = extensions
self.num_download_workers = num_download_workers
self.root = root
self._legacy = _legacy
if _legacy:
print("Using legacy structure")
self.split_folder = root
self.split = "unknown"
assert not download, "Cannot download the videos using legacy_structure."
else:
self.split_folder = path.join(root, split)
self.split = verify_str_arg(split, arg="split", valid_values=["train", "val"])
if download:
self.download_and_process_videos()
super().__init__(self.root)
self.classes, class_to_idx = find_classes(self.root)
self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
self.classes, class_to_idx = find_classes(self.split_folder)
self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
video_list = [x[0] for x in self.samples]
self.video_clips = VideoClips(
video_list,
......@@ -74,6 +145,88 @@ class Kinetics400(VisionDataset):
)
self.transform = transform
def download_and_process_videos(self) -> None:
    """Downloads all the videos to the _root_ folder in the expected format."""
    start = time.time()
    self._download_videos()
    downloaded_at = time.time()
    print("Elapsed time for downloading in mins ", (downloaded_at - start) / 60)
    self._make_ds_structure()
    finished_at = time.time()
    print("Elapsed time for processing in mins ", (finished_at - downloaded_at) / 60)
    print("Elapsed time overall in mins ", (finished_at - start) / 60)
def _download_videos(self) -> None:
    """Download tarballs containing the videos to the "tars" folder and extract them
    into the _split_ folder, where split is one of the official dataset splits.

    Raises:
        RuntimeError: if the download folder already exists, to prevent
            downloading the entire dataset again.
    """
    if path.exists(self.split_folder):
        raise RuntimeError(
            f"The directory {self.split_folder} already exists. "
            "If you want to re-download or re-extract the images, delete the directory."
        )
    tar_path = path.join(self.root, "tars")
    file_list_path = path.join(self.root, "files")

    split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
    split_url_filepath = path.join(file_list_path, path.basename(split_url))
    if not check_integrity(split_url_filepath):
        download_url(split_url, file_list_path)
    # Read the URL list inside a context manager; the original left the file
    # handle open for the lifetime of the process.
    with open(split_url_filepath, "r") as file_list:
        video_urls = [str(line).replace("\n", "") for line in file_list]

    if self.num_download_workers == 1:
        for url in video_urls:
            download_and_extract_archive(url, tar_path, self.split_folder)
    else:
        download_one = partial(_dl_wrap, tar_path, self.split_folder)
        # The context manager closes and joins the pool even if a worker
        # raises; the original Pool was never closed or joined.
        with Pool(self.num_download_workers) as pool:
            pool.map(download_one, video_urls)
def _make_ds_structure(self):
    """Rearrange the freshly extracted clips from a flat layout

    split_folder/
    ├── clip1.avi
    ├── clip2.avi

    into the one-directory-per-class layout expected downstream:

    split_folder/
    ├── class1
    │   ├── clip1.avi
    """
    annotation_dir = path.join(self.root, "annotations")
    annotation_file = path.join(annotation_dir, f"{self.split}.csv")
    if not check_integrity(annotation_file):
        download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_dir)

    name_template = "{ytid}_{start:06}_{end:06}.mp4"
    # Sanitize labels into directory names: spaces -> underscores; drop quotes/parens.
    label_table = str.maketrans({" ": "_", "'": None, "(": None, ")": None})
    with open(annotation_file) as csvfile:
        for row in csv.DictReader(csvfile):
            clip_name = name_template.format(
                ytid=row["youtube_id"],
                start=int(row["time_start"]),
                end=int(row["time_end"]),
            )
            label = row["label"].translate(label_table)
            os.makedirs(path.join(self.split_folder, label), exist_ok=True)
            flat_location = path.join(self.split_folder, clip_name)
            if path.isfile(flat_location):
                os.replace(flat_location, path.join(self.split_folder, label, clip_name))
@property
def metadata(self):
    """Expose the metadata of the underlying ``VideoClips`` object."""
    return self.video_clips.metadata
......@@ -83,9 +236,86 @@ class Kinetics400(VisionDataset):
def __getitem__(self, idx):
    """Return the ``(video, audio, label)`` triple for clip ``idx``."""
    clip, sound, _info, source_video_idx = self.video_clips.get_clip(idx)
    if not self._legacy:
        # Reorder [T, H, W, C] into the channels-first [T, C, H, W] layout.
        clip = clip.permute(0, 3, 1, 2)
    label = self.samples[source_video_idx][1]

    if self.transform is not None:
        clip = self.transform(clip)

    return clip, sound, label
class Kinetics400(Kinetics):
    """
    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
    dataset.

    Kinetics-400 is an action recognition video dataset.
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows:

            .. code::

                root/
                ├── class1
                │   ├── clip1.avi
                │   ├── clip2.avi
                │   ├── clip3.mp4
                │   └── ...
                └── class2
                    ├── clipx.avi
                    └── ...

        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C]): the `T` video frames
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points
            - label (int): class of the video clip
    """

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: Any = None,
        split: Any = None,
        download: Any = None,
        num_download_workers: Any = None,
        **kwargs: Any,
    ) -> None:
        # The Kinetics-only parameters are declared explicitly (defaulting to
        # None) so that passing them raises a clear error below instead of
        # being silently forwarded through **kwargs.
        warnings.warn(
            "Kinetics400 is deprecated and will be removed in a future release. "
            'It was replaced by Kinetics(..., num_classes="400").'
        )
        if any(value is not None for value in (num_classes, split, download, num_download_workers)):
            raise RuntimeError(
                "Usage of 'num_classes', 'split', 'download', or 'num_download_workers' is not supported in "
                "Kinetics400. Please use Kinetics instead."
            )
        # _legacy=True keeps the historical flat directory layout (no split
        # sub-folder) and the [T, H, W, C] clip format of the old class.
        super().__init__(
            root=root,
            frames_per_clip=frames_per_clip,
            _legacy=True,
            **kwargs,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment