Unverified Commit b969cca7 authored by Bruno Korbar's avatar Bruno Korbar Committed by GitHub
Browse files

Use Kinetics instead of Kinetics400 in references (#5787) (#5952)



* Dataset creation now supports "new" version of Kinetics dataset

* remove unnecessary warning for now

* provide kinetics option

* new reading somehow doesn't need BHWC to BCHW transform

* Addressing minor comments

* Adding kinetics deprecation warning for the old Kinetics400 class

* lint error

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarNicolas Hug <contact@nicolas-hug.com>

* Updating README

* Remove BHWC to BCHW

* Put warning back

* formatting
Co-authored-by: default avatarBruno Korbar <bkorbar@quansight.com>
Co-authored-by: default avatarVasilis Vryniotis <datumbox@users.noreply.github.com>
Co-authored-by: default avatarNicolas Hug <contact@nicolas-hug.com>
parent 16af6671
...@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4 ...@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4
Run the training on a single node with 8 GPUs: Run the training on a single node with 8 GPUs:
```bash ```bash
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp
``` ```
**Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution. **Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
...@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir= ...@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=
```bash ```bash
python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset
``` ```
### Additional Kinetics versions
Since the original release, additional versions of the Kinetics dataset have become available (Kinetics 600).
Our training scripts also support these versions of the dataset when the `--kinetics-version` parameter is set to `"600"`.
**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models.
import torch import torch
from torchvision.transforms import transforms from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW from transforms import ConvertBCHWtoCBHW
class VideoClassificationPresetTrain: class VideoClassificationPresetTrain:
...@@ -14,7 +14,6 @@ class VideoClassificationPresetTrain: ...@@ -14,7 +14,6 @@ class VideoClassificationPresetTrain:
hflip_prob=0.5, hflip_prob=0.5,
): ):
trans = [ trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size), transforms.Resize(resize_size),
] ]
...@@ -31,7 +30,6 @@ class VideoClassificationPresetEval: ...@@ -31,7 +30,6 @@ class VideoClassificationPresetEval:
def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)): def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose( self.transforms = transforms.Compose(
[ [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size), transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std), transforms.Normalize(mean=mean, std=std),
......
...@@ -130,8 +130,8 @@ def main(args): ...@@ -130,8 +130,8 @@ def main(args):
# Data loading code # Data loading code
print("Loading data") print("Loading data")
traindir = os.path.join(args.data_path, args.train_dir) traindir = os.path.join(args.data_path, "train")
valdir = os.path.join(args.data_path, args.val_dir) valdir = os.path.join(args.data_path, "val")
print("Loading training data") print("Loading training data")
st = time.time() st = time.time()
...@@ -145,9 +145,11 @@ def main(args): ...@@ -145,9 +145,11 @@ def main(args):
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400( dataset = torchvision.datasets.Kinetics(
traindir, args.data_path,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="train",
step_between_clips=1, step_between_clips=1,
transform=transform_train, transform=transform_train,
frame_rate=15, frame_rate=15,
...@@ -179,9 +181,11 @@ def main(args): ...@@ -179,9 +181,11 @@ def main(args):
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400( dataset_test = torchvision.datasets.Kinetics(
valdir, args.data_path,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="val",
step_between_clips=1, step_between_clips=1,
transform=transform_test, transform=transform_test,
frame_rate=15, frame_rate=15,
...@@ -312,8 +316,9 @@ def parse_args(): ...@@ -312,8 +316,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training") parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path") parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir") parser.add_argument(
parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir") "--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version"
)
parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name") parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip") parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
......
...@@ -2,13 +2,6 @@ import torch ...@@ -2,13 +2,6 @@ import torch
import torch.nn as nn import torch.nn as nn
class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2)
class ConvertBCHWtoCBHW(nn.Module): class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W)""" """Convert tensor from (B, C, H, W) to (C, B, H, W)"""
......
...@@ -308,6 +308,7 @@ class Kinetics400(Kinetics): ...@@ -308,6 +308,7 @@ class Kinetics400(Kinetics):
warnings.warn( warnings.warn(
"The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14." "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14."
"Please use Kinetics(..., num_classes='400') instead." "Please use Kinetics(..., num_classes='400') instead."
"Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format."
) )
if any(value is not None for value in (num_classes, split, download, num_download_workers)): if any(value is not None for value in (num_classes, split, download, num_download_workers)):
raise RuntimeError( raise RuntimeError(
......
...@@ -153,14 +153,13 @@ def _read_from_stream( ...@@ -153,14 +153,13 @@ def _read_from_stream(
gc.collect() gc.collect()
if pts_unit == "sec": if pts_unit == "sec":
# TODO: we should change all of this from ground up to simply take
# sec and convert to MS in C++
start_offset = int(math.floor(start_offset * (1 / stream.time_base))) start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
if end_offset != float("inf"): if end_offset != float("inf"):
end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
else: else:
warnings.warn( warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
"The pts_unit 'pts' gives wrong results and will be removed in a "
+ "follow-up version. Please use pts_unit 'sec'."
)
frames = {} frames = {}
should_buffer = True should_buffer = True
...@@ -176,9 +175,9 @@ def _read_from_stream( ...@@ -176,9 +175,9 @@ def _read_from_stream(
# can't use regex directly because of some weird characters sometimes... # can't use regex directly because of some weird characters sometimes...
pos = extradata.find(b"DivX") pos = extradata.find(b"DivX")
d = extradata[pos:] d = extradata[pos:]
o = re.search(br"DivX(\d+)Build(\d+)(\w)", d) o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
if o is None: if o is None:
o = re.search(br"DivX(\d+)b(\d+)(\w)", d) o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
if o is not None: if o is not None:
should_buffer = o.group(3) == b"p" should_buffer = o.group(3) == b"p"
seek_offset = start_offset seek_offset = start_offset
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment