Unverified Commit c7e29470, authored by Nicolas Hug, committed by GitHub

Fix Kinetics dataset docstring (#8121)

parent 4433680a
@@ -42,12 +42,12 @@ class Kinetics(VisionDataset):
         root/
         ├── split
         │   ├── class1
-        │   │   ├── clip1.mp4
-        │   │   ├── clip2.mp4
-        │   │   ├── clip3.mp4
+        │   │   ├── vid1.mp4
+        │   │   ├── vid2.mp4
+        │   │   ├── vid3.mp4
         │   │   ├── ...
         │   ├── class2
-        │   │   ├── clipx.mp4
+        │   │   ├── vidx.mp4
         │   │   └── ...

     Note: split is appended automatically using the split argument.
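For context, a minimal usage sketch under the layout above (not part of the commit), assuming the public torchvision.datasets.Kinetics API; the root path is hypothetical.

import torchvision

# Hypothetical root: it must contain the split directories shown above,
# e.g. <root>/train/<class1>/vid1.mp4. The split name is appended to
# root automatically, so we pass the parent directory.
dataset = torchvision.datasets.Kinetics(
    root="/data/kinetics400",
    frames_per_clip=16,   # frames in each clip the dataset yields
    num_classes="400",    # Kinetics variant: "400", "600", or "700"
    split="train",
)
video, audio, label = dataset[0]  # video tensor; layout depends on output_format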
@@ -135,8 +135,8 @@ class VideoClips:
         self.compute_clips(clip_length_in_frames, frames_between_clips, frame_rate)

     def _compute_frame_pts(self) -> None:
-        self.video_pts = []
-        self.video_fps: List[int] = []
+        self.video_pts = []  # len = num_videos. Each entry is a tensor of shape (num_frames_in_video,)
+        self.video_fps: List[int] = []  # len = num_videos
         # strategy: use a DataLoader to parallelize read_video_timestamps
         # so need to create a dummy dataset first
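A self-contained sketch of the strategy named in that comment (not torchvision's code; read_timestamps, _ProbeDataset, and the file names are hypothetical): wrap the file list in a dummy Dataset whose __getitem__ does the slow per-file read, so a DataLoader's worker processes parallelize it.

import torch
from torch.utils.data import DataLoader, Dataset

def read_timestamps(path):
    # Hypothetical stand-in for a slow per-file probe such as
    # read_video_timestamps; returns (pts list, fps).
    return [0, 512, 1024], 30

class _ProbeDataset(Dataset):
    # The "dummy" dataset: __getitem__ performs the expensive read,
    # so the DataLoader's workers run it in parallel across files.
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        return read_timestamps(self.paths[idx])

def _identity_collate(batch):
    # Keep the raw (pts, fps) pairs instead of stacking them into tensors.
    return batch

if __name__ == "__main__":  # required for DataLoader workers on spawn platforms
    paths = [f"video_{i}.mp4" for i in range(8)]  # hypothetical file list
    dl = DataLoader(_ProbeDataset(paths), batch_size=4, num_workers=2,
                    collate_fn=_identity_collate)
    video_pts, video_fps = [], []
    for batch in dl:
        batch_pts, batch_fps = zip(*batch)
        video_pts.extend(torch.as_tensor(p, dtype=torch.long) for p in batch_pts)
        video_fps.extend(batch_fps)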
@@ -152,13 +152,13 @@ class VideoClips:
         with tqdm(total=len(dl)) as pbar:
             for batch in dl:
                 pbar.update(1)
-                clips, fps = list(zip(*batch))
+                batch_pts, batch_fps = list(zip(*batch))
                 # we need to specify dtype=torch.long because for empty list,
                 # torch.as_tensor will use torch.float as default dtype. This
                 # happens when decoding fails and no pts is returned in the list.
-                clips = [torch.as_tensor(c, dtype=torch.long) for c in clips]
-                self.video_pts.extend(clips)
-                self.video_fps.extend(fps)
+                batch_pts = [torch.as_tensor(pts, dtype=torch.long) for pts in batch_pts]
+                self.video_pts.extend(batch_pts)
+                self.video_fps.extend(batch_fps)

     def _init_from_metadata(self, metadata: Dict[str, Any]) -> None:
         self.video_paths = metadata["video_paths"]
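An aside on the dtype comment above (not part of the commit): a quick demonstration of why the explicit dtype=torch.long matters when decoding fails and the pts list comes back empty.

import torch

good_pts = [0, 512, 1024]  # timestamps from a video that decoded fine
failed_pts = []            # decoding failed: no timestamps returned

# Without an explicit dtype, an empty list becomes a float tensor...
print(torch.as_tensor(failed_pts).dtype)                    # torch.float32
# ...while a non-empty list of ints becomes int64, so dtypes would mix.
print(torch.as_tensor(good_pts).dtype)                      # torch.int64
# Forcing torch.long keeps every entry of video_pts consistently integral.
print(torch.as_tensor(failed_pts, dtype=torch.long).dtype)  # torch.int64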