"vscode:/vscode.git/clone" did not exist on "eef9433b46006af606e9edbcdf6ce5c9255be303"
kinetics.py 12.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
import time
import os
import warnings


from os import path
import csv
from typing import Any, Callable, Dict, Optional, Tuple
from functools import partial
from multiprocessing import Pool
11
from torch import Tensor
12
13

from .utils import download_and_extract_archive, download_url, verify_str_arg, check_integrity
14
from .folder import find_classes, make_dataset
15
from .video_utils import VideoClips
16
17
18
from .vision import VisionDataset


19
def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
20
21
22
23
24
    download_and_extract_archive(line, tarpath, videopath)


class Kinetics(VisionDataset):
    """` Generic Kinetics <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
25
26
    dataset.

27
    Kinetics-400/600/700 are action recognition video datasets.
28
29
30
31
32
33
34
35
36
37
38
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Args:
39
40
        root (string): Root directory of the Kinetics Dataset.
            Directory should be structured as follows:
41
42
43
            .. code::

                root/
44
45
46
47
48
49
50
51
52
53
                ├── split
                │   ├──  class1
                │   │   ├──  clip1.mp4
                │   │   ├──  clip2.mp4
                │   │   ├──  clip3.mp4
                │   │   ├──  ...
                │   ├──  class2
                │   │   ├──   clipx.mp4
                │   │    └── ...
            Note: split is appended automatically using the split argument.
54
        frames_per_clip (int): number of frames in a clip
55
56
57
        num_classes (int): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700
        split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"``
        frame_rate (float): If omitted, interpolate different frame rate for each clip.
58
59
60
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that  takes in a TxHxWxC video
            and returns a transformed version.
61
62
63
        download (bool): Download the official version of the dataset to root folder.
        num_workers (int): Use multiple workers for VideoClips creation
        num_download_workers (int): Use multiprocessing in order to speed up download.
64
65

    Returns:
66
67
        tuple: A 3-tuple with the following entries:

68
            - video (Tensor[T, C, H, W]): the `T` video frames in torch.uint8 tensor
69
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
70
              and `L` is the number of points in torch.float tensor
71
            - label (int): class of the video clip
72
73
74

    Raises:
        RuntimeError: If ``download is True`` and the video archives are already extracted.
75
76
    """

77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    _TAR_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
        "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
    }
    _ANNOTATION_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
        "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
    }

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: str = "400",
        split: str = "train",
94
        frame_rate: Optional[int] = None,
95
96
97
98
99
100
        step_between_clips: int = 1,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = ("avi", "mp4"),
        download: bool = False,
        num_download_workers: int = 1,
        num_workers: int = 1,
101
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        _audio_channels: int = 0,
        _legacy: bool = False,
    ) -> None:

        # TODO: support test
        self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
        self.extensions = extensions
        self.num_download_workers = num_download_workers

        self.root = root
        self._legacy = _legacy
        if _legacy:
            print("Using legacy structure")
            self.split_folder = root
            self.split = "unknown"
            assert not download, "Cannot download the videos using legacy_structure."
        else:
            self.split_folder = path.join(root, split)
            self.split = verify_str_arg(split, arg="split", valid_values=["train", "val"])

        if download:
            self.download_and_process_videos()

        super().__init__(self.root)
130

131
132
        self.classes, class_to_idx = find_classes(self.split_folder)
        self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
133
        video_list = [x[0] for x in self.samples]
134
135
136
137
138
139
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
140
141
142
143
144
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
145
            _audio_channels=_audio_channels,
146
        )
147
        self.transform = transform
148

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
    def download_and_process_videos(self) -> None:
        """Downloads all the videos to the _root_ folder in the expected format."""
        tic = time.time()
        self._download_videos()
        toc = time.time()
        print("Elapsed time for downloading in mins ", (toc - tic) / 60)
        self._make_ds_structure()
        toc2 = time.time()
        print("Elapsed time for processing in mins ", (toc2 - toc) / 60)
        print("Elapsed time overall in mins ", (toc2 - tic) / 60)

    def _download_videos(self) -> None:
        """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where
        split is one of the official dataset splits.

        Raises:
            RuntimeError: if download folder exists, break to prevent downloading entire dataset again.
        """
        if path.exists(self.split_folder):
            raise RuntimeError(
                f"The directory {self.split_folder} already exists. "
                f"If you want to re-download or re-extract the images, delete the directory."
            )
        tar_path = path.join(self.root, "tars")
        file_list_path = path.join(self.root, "files")

        split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
        split_url_filepath = path.join(file_list_path, path.basename(split_url))
        if not check_integrity(split_url_filepath):
            download_url(split_url, file_list_path)
        list_video_urls = open(split_url_filepath, "r")

        if self.num_download_workers == 1:
            for line in list_video_urls.readlines():
                line = str(line).replace("\n", "")
                download_and_extract_archive(line, tar_path, self.split_folder)
        else:
            part = partial(_dl_wrap, tar_path, self.split_folder)
            lines = [str(line).replace("\n", "") for line in list_video_urls.readlines()]
            poolproc = Pool(self.num_download_workers)
            poolproc.map(part, lines)

191
    def _make_ds_structure(self) -> None:
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
        """move videos from
        split_folder/
            ├── clip1.avi
            ├── clip2.avi

        to the correct format as described below:
        split_folder/
            ├── class1
            │   ├── clip1.avi

        """
        annotation_path = path.join(self.root, "annotations")
        if not check_integrity(path.join(annotation_path, f"{self.split}.csv")):
            download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)
        annotations = path.join(annotation_path, f"{self.split}.csv")

        file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
        with open(annotations) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                f = file_fmtstr.format(
                    ytid=row["youtube_id"],
                    start=int(row["time_start"]),
                    end=int(row["time_end"]),
                )
                label = (
                    row["label"]
                    .replace(" ", "_")
                    .replace("'", "")
                    .replace("(", "")
                    .replace(")", "")
                )
                os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                downloaded_file = path.join(self.split_folder, f)
                if path.isfile(downloaded_file):
                    os.replace(
                        downloaded_file, path.join(self.split_folder, label, f),
                    )

231
    @property
232
    def metadata(self) -> Dict[str, Any]:
233
234
        return self.video_clips.metadata

235
    def __len__(self) -> int:
236
237
        return self.video_clips.num_clips()

238
    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
239
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
240
241
242
        if not self._legacy:
            # [T,H,W,C] --> [T,C,H,W]
            video = video.permute(0, 3, 1, 2)
243
244
        label = self.samples[video_idx][1]

245
246
247
        if self.transform is not None:
            video = self.transform(video)

248
        return video, audio, label
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306


class Kinetics400(Kinetics):
    """
    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
    dataset.

    Kinetics-400 is an action recognition video dataset.
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows:

            .. code::

                root/
                ├── class1
                │   ├── clip1.avi
                │   ├── clip2.avi
                │   ├── clip3.mp4
                │   └── ...
                └── class2
                    ├── clipx.avi
                    └── ...

        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that  takes in a TxHxWxC video
            and returns a transformed version.

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C]): the `T` video frames
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points
            - label (int): class of the video clip
    """

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: Any = None,
        split: Any = None,
        download: Any = None,
        num_download_workers: Any = None,
        **kwargs: Any
307
    ) -> None:
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
        warnings.warn(
            "Kinetics400 is deprecated and will be removed in a future release."
            "It was replaced by Kinetics(..., num_classes=\"400\").")
        if any(value is not None for value in (num_classes, split, download, num_download_workers)):
            raise RuntimeError(
                "Usage of 'num_classes', 'split', 'download', or 'num_download_workers' is not supported in "
                "Kinetics400. Please use Kinetics instead."
            )

        super(Kinetics400, self).__init__(
            root=root,
            frames_per_clip=frames_per_clip,
            _legacy=True,
            **kwargs,
        )