kinetics.py 10.1 KB
Newer Older
limm's avatar
limm committed
1
2
3
4
5
6
7
8
9
10
11
12
import csv
import os
import time
import urllib
from functools import partial
from multiprocessing import Pool
from os import path
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Tuple, Union

from torch import Tensor

13
from .folder import find_classes, make_dataset
limm's avatar
limm committed
14
from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg
15
from .video_utils import VideoClips
16
17
18
from .vision import VisionDataset


limm's avatar
limm committed
19
20
21
22
23
24
def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
    download_and_extract_archive(line, tarpath, videopath)


class Kinetics(VisionDataset):
    """`Generic Kinetics <https://www.deepmind.com/open-source/kinetics>`_
25
26
    dataset.

limm's avatar
limm committed
27
    Kinetics-400/600/700 are action recognition video datasets.
28
29
30
31
32
33
34
35
36
37
38
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Args:
limm's avatar
limm committed
39
40
        root (str or ``pathlib.Path``): Root directory of the Kinetics Dataset.
            Directory should be structured as follows:
41
42
43
            .. code::

                root/
limm's avatar
limm committed
44
45
46
47
48
49
50
51
52
53
54
                ├── split
                │   ├──  class1
                │   │   ├──  vid1.mp4
                │   │   ├──  vid2.mp4
                │   │   ├──  vid3.mp4
                │   │   ├──  ...
                │   ├──  class2
                │   │   ├──   vidx.mp4
                │   │    └── ...

            Note: split is appended automatically using the split argument.
55
        frames_per_clip (int): number of frames in a clip
limm's avatar
limm committed
56
57
58
        num_classes (int): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700
        split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"`` ``"test"``
        frame_rate (float): If omitted, interpolate different frame rate for each clip.
59
        step_between_clips (int): number of frames between each clip
limm's avatar
limm committed
60
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
61
            and returns a transformed version.
limm's avatar
limm committed
62
63
64
65
66
67
        download (bool): Download the official version of the dataset to root folder.
        num_workers (int): Use multiple workers for VideoClips creation
        num_download_workers (int): Use multiprocessing in order to speed up download.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" or "TCHW" (default).
            Note that in most other utils and datasets, the default is actually "THWC".
68
69

    Returns:
70
71
        tuple: A 3-tuple with the following entries:

limm's avatar
limm committed
72
            - video (Tensor[T, C, H, W] or Tensor[T, H, W, C]): the `T` video frames in torch.uint8 tensor
73
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
limm's avatar
limm committed
74
              and `L` is the number of points in torch.float tensor
75
            - label (int): class of the video clip
76

limm's avatar
limm committed
77
78
79
    Raises:
        RuntimeError: If ``download is True`` and the video archives are already extracted.
    """
80

limm's avatar
limm committed
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
    _TAR_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
        "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
    }
    _ANNOTATION_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
        "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.csv",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
    }

    def __init__(
        self,
        root: Union[str, Path],
        frames_per_clip: int,
        num_classes: str = "400",
        split: str = "train",
        frame_rate: Optional[int] = None,
        step_between_clips: int = 1,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = ("avi", "mp4"),
        download: bool = False,
        num_download_workers: int = 1,
        num_workers: int = 1,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        _audio_channels: int = 0,
        _legacy: bool = False,
        output_format: str = "TCHW",
    ) -> None:

        # TODO: support test
        self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
        self.extensions = extensions
        self.num_download_workers = num_download_workers

        self.root = root
        self._legacy = _legacy

        if _legacy:
            print("Using legacy structure")
            self.split_folder = root
            self.split = "unknown"
            output_format = "THWC"
            if download:
                raise ValueError("Cannot download the videos using legacy_structure.")
        else:
            self.split_folder = path.join(root, split)
            self.split = verify_str_arg(split, arg="split", valid_values=["train", "val", "test"])

        if download:
            self.download_and_process_videos()

        super().__init__(self.root)

        self.classes, class_to_idx = find_classes(self.split_folder)
        self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
141
        video_list = [x[0] for x in self.samples]
142
143
144
145
146
147
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
148
149
150
151
152
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
153
            _audio_channels=_audio_channels,
limm's avatar
limm committed
154
            output_format=output_format,
155
        )
156
        self.transform = transform
157

limm's avatar
limm committed
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    def download_and_process_videos(self) -> None:
        """Downloads all the videos to the _root_ folder in the expected format."""
        tic = time.time()
        self._download_videos()
        toc = time.time()
        print("Elapsed time for downloading in mins ", (toc - tic) / 60)
        self._make_ds_structure()
        toc2 = time.time()
        print("Elapsed time for processing in mins ", (toc2 - toc) / 60)
        print("Elapsed time overall in mins ", (toc2 - tic) / 60)

    def _download_videos(self) -> None:
        """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where
        split is one of the official dataset splits.

        Raises:
            RuntimeError: if download folder exists, break to prevent downloading entire dataset again.
        """
        if path.exists(self.split_folder):
            raise RuntimeError(
                f"The directory {self.split_folder} already exists. "
                f"If you want to re-download or re-extract the images, delete the directory."
            )
        tar_path = path.join(self.root, "tars")
        file_list_path = path.join(self.root, "files")

        split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
        split_url_filepath = path.join(file_list_path, path.basename(split_url))
        if not check_integrity(split_url_filepath):
            download_url(split_url, file_list_path)
        with open(split_url_filepath) as file:
            list_video_urls = [urllib.parse.quote(line, safe="/,:") for line in file.read().splitlines()]

        if self.num_download_workers == 1:
            for line in list_video_urls:
                download_and_extract_archive(line, tar_path, self.split_folder)
        else:
            part = partial(_dl_wrap, tar_path, self.split_folder)
            poolproc = Pool(self.num_download_workers)
            poolproc.map(part, list_video_urls)

    def _make_ds_structure(self) -> None:
        """move videos from
        split_folder/
            ├── clip1.avi
            ├── clip2.avi

        to the correct format as described below:
        split_folder/
            ├── class1
            │   ├── clip1.avi

        """
        annotation_path = path.join(self.root, "annotations")
        if not check_integrity(path.join(annotation_path, f"{self.split}.csv")):
            download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)
        annotations = path.join(annotation_path, f"{self.split}.csv")

        file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
        with open(annotations) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                f = file_fmtstr.format(
                    ytid=row["youtube_id"],
                    start=int(row["time_start"]),
                    end=int(row["time_end"]),
                )
                label = row["label"].replace(" ", "_").replace("'", "").replace("(", "").replace(")", "")
                os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                downloaded_file = path.join(self.split_folder, f)
                if path.isfile(downloaded_file):
                    os.replace(
                        downloaded_file,
                        path.join(self.split_folder, label, f),
                    )

234
    @property
limm's avatar
limm committed
235
    def metadata(self) -> Dict[str, Any]:
236
237
        return self.video_clips.metadata

limm's avatar
limm committed
238
    def __len__(self) -> int:
239
240
        return self.video_clips.num_clips()

limm's avatar
limm committed
241
    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
242
243
244
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

245
246
247
        if self.transform is not None:
            video = self.transform(video)

248
        return video, audio, label