import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from torch import Tensor

from .folder import find_classes, make_dataset
from .video_utils import VideoClips
from .vision import VisionDataset


class UCF101(VisionDataset):
    """
    `UCF101 <https://www.crcv.ucf.edu/data/UCF101.php>`_ dataset.

    UCF101 is an action recognition video dataset.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``. The dataset itself can be downloaded from the dataset website;
    annotations that ``annotation_path`` should be pointing to can be downloaded from `here
    <https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip>`_.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (str or ``pathlib.Path``): Root directory of the UCF101 Dataset.
        annotation_path (str): path to the folder containing the split files;
            see docstring above for download instructions of these files
        frames_per_clip (int): number of frames in a clip.
        step_between_clips (int, optional): number of frames between each clip.
        fold (int, optional): which fold to use. Should be between 1 and 3.
        train (bool, optional): if ``True``, creates a dataset from the train split,
            otherwise from the ``test`` split.
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" (default) or "TCHW".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C] or Tensor[T, C, H, W]): The `T` video frames
            -  audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
               and `L` is the number of points
            - label (int): class of the video clip
    """

    def __init__(
        self,
        root: Union[str, Path],
        annotation_path: str,
        frames_per_clip: int,
        step_between_clips: int = 1,
        frame_rate: Optional[int] = None,
        fold: int = 1,
        train: bool = True,
        transform: Optional[Callable] = None,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        num_workers: int = 1,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        output_format: str = "THWC",
    ) -> None:
        super().__init__(root)
        if not 1 <= fold <= 3:
            raise ValueError(f"fold should be between 1 and 3, got {fold}")

        extensions = ("avi",)
        self.fold = fold
        self.train = train

        self.classes, class_to_idx = find_classes(self.root)
        self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
        video_list = [x[0] for x in self.samples]
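        # VideoClips scans every video to compute per-frame timestamps and then
        # enumerates all fixed-size clips; this pass can be slow, so ``num_workers``
        # parallelizes it and ``_precomputed_metadata`` allows skipping it entirely.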
        video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            output_format=output_format,
        )
        # we bookkeep the full version of video clips because we want to be able
        # to return the metadata of the full version rather than the subset version of
        # video clips
        self.full_video_clips = video_clips
        self.indices = self._select_fold(video_list, annotation_path, fold, train)
        self.video_clips = video_clips.subset(self.indices)
        self.transform = transform

    @property
    def metadata(self) -> Dict[str, Any]:
        return self.full_video_clips.metadata

    def _select_fold(self, video_list: List[str], annotation_path: str, fold: int, train: bool) -> List[int]:
        name = "train" if train else "test"
        name = f"{name}list{fold:02d}.txt"
        f = os.path.join(annotation_path, name)
        selected_files = set()
        with open(f) as fid:
            data = fid.readlines()
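            # each line holds a video path relative to ``root``, e.g.
            # "ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi"; the train lists also append
            # a class index after a space, so only the first token is kept below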
            data = [x.strip().split(" ")[0] for x in data]
            data = [os.path.join(self.root, *x.split("/")) for x in data]
            selected_files.update(data)
        indices = [i for i in range(len(video_list)) if video_list[i] in selected_files]
        return indices

    def __len__(self) -> int:
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[self.indices[video_idx]][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label