"model/vscode:/vscode.git/clone" did not exist on "ec9eb28f4c3481d58c6da38ee488cb8cd5379256"
kinetics.py 10.1 KB
Newer Older
1
import csv
import os
import time
import urllib.parse
from functools import partial
from multiprocessing import Pool
from os import path
from typing import Any, Callable, Dict, Optional, Tuple

from torch import Tensor

from .folder import find_classes, make_dataset
from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg
from .video_utils import VideoClips
from .vision import VisionDataset


def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
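    # Module-level helper so it can be pickled for multiprocessing.Pool; the tar path and
    # target video path are bound with functools.partial in Kinetics._download_videos.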
    download_and_extract_archive(line, tarpath, videopath)


class Kinetics(VisionDataset):
    """`Generic Kinetics <https://www.deepmind.com/open-source/kinetics>`_
    dataset.

    Kinetics-400/600/700 are action recognition video datasets.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.
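    A minimal usage sketch of that example (the root path is hypothetical and the videos are
    assumed to already be on disk)::

        dataset = Kinetics("path/to/kinetics", frames_per_clip=5, step_between_clips=5)
        print(len(dataset))  # 5 clips for the 2 example videos above
        video, audio, label = dataset[0]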

    Args:
        root (string): Root directory of the Kinetics Dataset.
            Directory should be structured as follows:
            .. code::

                root/
                ├── split
                │   ├──  class1
                │   │   ├──  vid1.mp4
                │   │   ├──  vid2.mp4
                │   │   ├──  vid3.mp4
                │   │   ├──  ...
                │   ├──  class2
                │   │   ├──  vidx.mp4
                │   │   └── ...

            Note: split is appended automatically using the split argument.
        frames_per_clip (int): number of frames in a clip
        num_classes (str): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700
        split (str): split of the dataset to consider; supports ``"train"`` (default), ``"val"``, and ``"test"``
        frame_rate (int, optional): If omitted, each clip keeps the frame rate of its source video;
            otherwise clips are resampled to this frame rate.
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that takes in a video tensor
            (in the ``output_format`` layout) and returns a transformed version.
        extensions (tuple of str): allowed video file extensions (default: ``("avi", "mp4")``)
        download (bool): Download the official version of the dataset to the root folder.
        num_workers (int): number of workers used for VideoClips creation
        num_download_workers (int): number of processes used to speed up the download (via multiprocessing)
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" or "TCHW" (default).
            Note that in most other utils and datasets, the default is actually "THWC".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, C, H, W] or Tensor[T, H, W, C]): the `T` video frames as a torch.uint8 tensor
            - audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points, as a torch.float tensor
            - label (int): class of the video clip

    Raises:
        RuntimeError: If ``download is True`` and the video archives are already extracted.
    """

    _TAR_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
        "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
    }
    _ANNOTATION_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
        "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.csv",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
    }

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: str = "400",
        split: str = "train",
        frame_rate: Optional[int] = None,
        step_between_clips: int = 1,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = ("avi", "mp4"),
        download: bool = False,
        num_download_workers: int = 1,
        num_workers: int = 1,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        _audio_channels: int = 0,
        _legacy: bool = False,
        output_format: str = "TCHW",
    ) -> None:

        # TODO: support test
        self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
        self.extensions = extensions
        self.num_download_workers = num_download_workers

        self.root = root
        self._legacy = _legacy

        if _legacy:
            print("Using legacy structure")
            self.split_folder = root
            self.split = "unknown"
            output_format = "THWC"
            if download:
                raise ValueError("Cannot download the videos using legacy_structure.")
        else:
            self.split_folder = path.join(root, split)
            self.split = verify_str_arg(split, arg="split", valid_values=["train", "val", "test"])

        if download:
            self.download_and_process_videos()

        super().__init__(self.root)

        self.classes, class_to_idx = find_classes(self.split_folder)
        self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
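        # self.samples holds (video_path, class_index) pairs discovered under the split folder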
        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            _audio_channels=_audio_channels,
            output_format=output_format,
        )
        self.transform = transform

    def download_and_process_videos(self) -> None:
        """Downloads all the videos to the _root_ folder in the expected format."""
        tic = time.time()
        self._download_videos()
        toc = time.time()
        print("Elapsed time for downloading in mins ", (toc - tic) / 60)
        self._make_ds_structure()
        toc2 = time.time()
        print("Elapsed time for processing in mins ", (toc2 - toc) / 60)
        print("Elapsed time overall in mins ", (toc2 - tic) / 60)

    def _download_videos(self) -> None:
        """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where
        split is one of the official dataset splits.

        Raises:
            RuntimeError: If the download folder already exists, to avoid re-downloading the entire dataset.
        """
        if path.exists(self.split_folder):
            raise RuntimeError(
                f"The directory {self.split_folder} already exists. "
                f"If you want to re-download or re-extract the images, delete the directory."
            )
        tar_path = path.join(self.root, "tars")
        file_list_path = path.join(self.root, "files")

        split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
        split_url_filepath = path.join(file_list_path, path.basename(split_url))
        if not check_integrity(split_url_filepath):
            download_url(split_url, file_list_path)
        with open(split_url_filepath) as file:
            list_video_urls = [urllib.parse.quote(line, safe="/,:") for line in file.read().splitlines()]

        if self.num_download_workers == 1:
            for line in list_video_urls:
                download_and_extract_archive(line, tar_path, self.split_folder)
        else:
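            # Bind tar_path and the split folder up front so that Pool.map only has to pass each URL.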
            part = partial(_dl_wrap, tar_path, self.split_folder)
            poolproc = Pool(self.num_download_workers)
            poolproc.map(part, list_video_urls)

    def _make_ds_structure(self) -> None:
        """move videos from
        split_folder/
            ├── clip1.avi
            ├── clip2.avi

        to the correct format as described below:
        split_folder/
            ├── class1
            │   ├── clip1.avi

        """
        annotation_path = path.join(self.root, "annotations")
        if not check_integrity(path.join(annotation_path, f"{self.split}.csv")):
            download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)
        annotations = path.join(annotation_path, f"{self.split}.csv")

        file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
        with open(annotations) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                f = file_fmtstr.format(
                    ytid=row["youtube_id"],
                    start=int(row["time_start"]),
                    end=int(row["time_end"]),
                )
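                # Sanitize the label into a folder name, e.g. "riding a bike" -> "riding_a_bike"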
                label = row["label"].replace(" ", "_").replace("'", "").replace("(", "").replace(")", "")
                os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                downloaded_file = path.join(self.split_folder, f)
                if path.isfile(downloaded_file):
                    os.replace(
                        downloaded_file,
                        path.join(self.split_folder, label, f),
                    )

    @property
    def metadata(self) -> Dict[str, Any]:
        return self.video_clips.metadata

    def __len__(self) -> int:
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label