Unverified Commit b969cca7 authored by Bruno Korbar's avatar Bruno Korbar Committed by GitHub
Browse files

Use Kinetics instead of Kinetics400 in references (#5787) (#5952)



* Dataset creation now supports "new" version of Kinetics dataset

* remove unnecessary warning for now

* provide kinetics option

* new reading somehow doesn't need BHWC to BCHW transform

* Addressing minor comments

* Adding kinetics deprecation warning for the old Kinetics400 class

* lint error

* Update torchvision/datasets/kinetics.py
Co-authored-by: default avatarNicolas Hug <contact@nicolas-hug.com>

* Updating README

* Remove BHWC to BCHW

* Put warning back

* formatting
Co-authored-by: default avatarBruno Korbar <bkorbar@quansight.com>
Co-authored-by: default avatarVasilis Vryniotis <datumbox@users.noreply.github.com>
Co-authored-by: default avatarNicolas Hug <contact@nicolas-hug.com>
parent 16af6671
...@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4 ...@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4
Run the training on a single node with 8 GPUs: Run the training on a single node with 8 GPUs:
```bash ```bash
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp
``` ```
**Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution. **Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
...@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir= ...@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=
```bash ```bash
python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset
``` ```
### Additional Kinetics versions
Since the original release, additional versions of the Kinetics dataset have become available (Kinetics 600).
Our training scripts also support these versions of the dataset when the `--kinetics-version` parameter is set to `"600"`.
**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models.
import torch import torch
from torchvision.transforms import transforms from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW from transforms import ConvertBCHWtoCBHW
class VideoClassificationPresetTrain: class VideoClassificationPresetTrain:
...@@ -14,7 +14,6 @@ class VideoClassificationPresetTrain: ...@@ -14,7 +14,6 @@ class VideoClassificationPresetTrain:
hflip_prob=0.5, hflip_prob=0.5,
): ):
trans = [ trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size), transforms.Resize(resize_size),
] ]
...@@ -31,7 +30,6 @@ class VideoClassificationPresetEval: ...@@ -31,7 +30,6 @@ class VideoClassificationPresetEval:
def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)): def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose( self.transforms = transforms.Compose(
[ [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size), transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std), transforms.Normalize(mean=mean, std=std),
......
...@@ -130,8 +130,8 @@ def main(args): ...@@ -130,8 +130,8 @@ def main(args):
# Data loading code # Data loading code
print("Loading data") print("Loading data")
traindir = os.path.join(args.data_path, args.train_dir) traindir = os.path.join(args.data_path, "train")
valdir = os.path.join(args.data_path, args.val_dir) valdir = os.path.join(args.data_path, "val")
print("Loading training data") print("Loading training data")
st = time.time() st = time.time()
...@@ -145,9 +145,11 @@ def main(args): ...@@ -145,9 +145,11 @@ def main(args):
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400( dataset = torchvision.datasets.Kinetics(
traindir, args.data_path,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="train",
step_between_clips=1, step_between_clips=1,
transform=transform_train, transform=transform_train,
frame_rate=15, frame_rate=15,
...@@ -179,9 +181,11 @@ def main(args): ...@@ -179,9 +181,11 @@ def main(args):
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster") print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400( dataset_test = torchvision.datasets.Kinetics(
valdir, args.data_path,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="val",
step_between_clips=1, step_between_clips=1,
transform=transform_test, transform=transform_test,
frame_rate=15, frame_rate=15,
...@@ -312,8 +316,9 @@ def parse_args(): ...@@ -312,8 +316,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training") parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path") parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir") parser.add_argument(
parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir") "--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version"
)
parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name") parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip") parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
......
...@@ -2,13 +2,6 @@ import torch ...@@ -2,13 +2,6 @@ import torch
import torch.nn as nn import torch.nn as nn
class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2)
class ConvertBCHWtoCBHW(nn.Module): class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W)""" """Convert tensor from (B, C, H, W) to (C, B, H, W)"""
......
...@@ -308,6 +308,7 @@ class Kinetics400(Kinetics): ...@@ -308,6 +308,7 @@ class Kinetics400(Kinetics):
warnings.warn( warnings.warn(
"The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14." "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14."
"Please use Kinetics(..., num_classes='400') instead." "Please use Kinetics(..., num_classes='400') instead."
"Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format."
) )
if any(value is not None for value in (num_classes, split, download, num_download_workers)): if any(value is not None for value in (num_classes, split, download, num_download_workers)):
raise RuntimeError( raise RuntimeError(
......
...@@ -153,14 +153,13 @@ def _read_from_stream( ...@@ -153,14 +153,13 @@ def _read_from_stream(
gc.collect() gc.collect()
if pts_unit == "sec": if pts_unit == "sec":
# TODO: we should change all of this from ground up to simply take
# sec and convert to MS in C++
start_offset = int(math.floor(start_offset * (1 / stream.time_base))) start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
if end_offset != float("inf"): if end_offset != float("inf"):
end_offset = int(math.ceil(end_offset * (1 / stream.time_base))) end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
else: else:
warnings.warn( warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
"The pts_unit 'pts' gives wrong results and will be removed in a "
+ "follow-up version. Please use pts_unit 'sec'."
)
frames = {} frames = {}
should_buffer = True should_buffer = True
...@@ -176,9 +175,9 @@ def _read_from_stream( ...@@ -176,9 +175,9 @@ def _read_from_stream(
# can't use regex directly because of some weird characters sometimes... # can't use regex directly because of some weird characters sometimes...
pos = extradata.find(b"DivX") pos = extradata.find(b"DivX")
d = extradata[pos:] d = extradata[pos:]
o = re.search(br"DivX(\d+)Build(\d+)(\w)", d) o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
if o is None: if o is None:
o = re.search(br"DivX(\d+)b(\d+)(\w)", d) o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
if o is not None: if o is not None:
should_buffer = o.group(3) == b"p" should_buffer = o.group(3) == b"p"
seek_offset = start_offset seek_offset = start_offset
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment