Unverified commit b969cca7 authored by Bruno Korbar, committed by GitHub

Use Kinetics instead of Kinetics400 in references (#5787) (#5952)



* Dataset creation now supports "new" version of Kinetics dataset

* remove unnecessary warning for now

* provide kinetics option

* The new reading path doesn't need the BHWC to BCHW transform

* Addressing minor comments

* Adding Kinetics deprecation warning for the old Kinetics400 class

* lint error

* Update torchvision/datasets/kinetics.py
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>

* Updating README

* Remove BHWC to BCHW

* Put warning back

* formatting
Co-authored-by: Bruno Korbar <bkorbar@quansight.com>
Co-authored-by: Vasilis Vryniotis <datumbox@users.noreply.github.com>
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
parent 16af6671
@@ -18,7 +18,7 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4
Run the training on a single node with 8 GPUs:
```bash
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --amp
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=16 --cache-dataset --sync-bn --amp
```
**Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
@@ -30,5 +30,13 @@ torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --train-dir=
```bash
python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset
```
### Additional Kinetics versions
Since the original release, additional versions of Kinetics dataset became available (Kinetics 600).
Our training scripts support these versions of dataset as well by setting the `--kinetics-version` parameter to `"600"`.
**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models.
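For reference, the `--kinetics-version` flag is forwarded to the `num_classes` argument of `torchvision.datasets.Kinetics` (see the `train.py` hunks below). A minimal sketch of the equivalent dataset construction, assuming the Kinetics 600 videos live under a hypothetical `/data/kinetics600` root:

```python
import torchvision

# Hypothetical root; it must contain the extracted Kinetics 600 videos
# in train/ and val/ split folders, as the dataset class expects.
dataset = torchvision.datasets.Kinetics(
    "/data/kinetics600",
    frames_per_clip=16,      # matches the default --clip-len in train.py
    num_classes="600",       # what --kinetics-version="600" selects
    split="train",
    step_between_clips=1,
    frame_rate=15,
)
```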
import torch
from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW
from transforms import ConvertBCHWtoCBHW
class VideoClassificationPresetTrain:
@@ -14,7 +14,6 @@ class VideoClassificationPresetTrain:
hflip_prob=0.5,
):
trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
]
@@ -31,7 +30,6 @@ class VideoClassificationPresetEval:
def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose(
[
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std),
......
@@ -130,8 +130,8 @@ def main(args):
# Data loading code
print("Loading data")
traindir = os.path.join(args.data_path, args.train_dir)
valdir = os.path.join(args.data_path, args.val_dir)
traindir = os.path.join(args.data_path, "train")
valdir = os.path.join(args.data_path, "val")
print("Loading training data")
st = time.time()
@@ -145,9 +145,11 @@
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400(
traindir,
dataset = torchvision.datasets.Kinetics(
args.data_path,
frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="train",
step_between_clips=1,
transform=transform_train,
frame_rate=15,
@@ -179,9 +181,11 @@
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400(
valdir,
dataset_test = torchvision.datasets.Kinetics(
args.data_path,
frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="val",
step_between_clips=1,
transform=transform_test,
frame_rate=15,
@@ -312,8 +316,9 @@ def parse_args():
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
parser.add_argument("--train-dir", default="train_avi-480p", type=str, help="name of train dir")
parser.add_argument("--val-dir", default="val_avi-480p", type=str, help="name of val dir")
parser.add_argument(
"--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version"
)
parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
......
@@ -2,13 +2,6 @@ import torch
import torch.nn as nn
class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2)
class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W)"""
......
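The companion `ConvertBCHWtoCBHW` transform is cut off by the collapsed diff above. Judging from its docstring, its `forward` presumably reduces to a single permute; a sketch inferred from that docstring, not the verbatim file contents:

```python
import torch
import torch.nn as nn


class ConvertBCHWtoCBHW(nn.Module):
    """Convert tensor from (B, C, H, W) to (C, B, H, W)"""

    def forward(self, vid: torch.Tensor) -> torch.Tensor:
        # Move channels in front of the batch/time dimension.
        return vid.permute(1, 0, 2, 3)
```

In the video presets, B is the temporal dimension of a clip, so once `Kinetics` returns clips as `Tensor[T, C, H, W]` (see the deprecation note below) the `ConvertBHWCtoBCHW` step removed here becomes redundant, while this transform still produces the channels-first `(C, T, H, W)` layout the video models consume.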
@@ -308,6 +308,7 @@ class Kinetics400(Kinetics):
warnings.warn(
"The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14."
"Please use Kinetics(..., num_classes='400') instead."
"Note that Kinetics(..., num_classes='400') returns video in a more logical Tensor[T, C, H, W] format."
)
if any(value is not None for value in (num_classes, split, download, num_download_workers)):
raise RuntimeError(
......
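The warning added above spells out the migration path. A hedged sketch of old versus new usage, with a hypothetical dataset root:

```python
import torchvision

# Deprecated since 0.12 (slated for removal in 0.14); constructing it now
# emits the warning added in this hunk, and clips come back channels-last,
# which is why the presets previously needed ConvertBHWCtoBCHW.
# old_dataset = torchvision.datasets.Kinetics400("/data/kinectics400/train", frames_per_clip=16)

# Replacement suggested by the warning; clips are returned as Tensor[T, C, H, W].
dataset = torchvision.datasets.Kinetics(
    "/data/kinectics400",   # hypothetical root containing train/ and val/ folders
    frames_per_clip=16,
    num_classes="400",
    split="train",
)
```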
@@ -153,14 +153,13 @@ def _read_from_stream(
gc.collect()
if pts_unit == "sec":
# TODO: we should change all of this from ground up to simply take
# sec and convert to MS in C++
start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
if end_offset != float("inf"):
end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
else:
warnings.warn(
"The pts_unit 'pts' gives wrong results and will be removed in a "
+ "follow-up version. Please use pts_unit 'sec'."
)
warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")
frames = {}
should_buffer = True
@@ -176,9 +175,9 @@ def _read_from_stream(
# can't use regex directly because of some weird characters sometimes...
pos = extradata.find(b"DivX")
d = extradata[pos:]
o = re.search(br"DivX(\d+)Build(\d+)(\w)", d)
o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
if o is None:
o = re.search(br"DivX(\d+)b(\d+)(\w)", d)
o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
if o is not None:
should_buffer = o.group(3) == b"p"
seek_offset = start_offset
......
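For callers, the practical takeaway from this hunk is to pass `pts_unit="sec"`: second-based offsets are floor/ceil-converted to stream timestamps via `1 / stream.time_base`, while `'pts'` now simply warns that it gives wrong results. A small usage sketch with a hypothetical file path:

```python
import torchvision

# Decode only the first two seconds of a (hypothetical) Kinetics clip.
video, audio, info = torchvision.io.read_video(
    "/data/kinectics400/train/abseiling/sample.avi",
    start_pts=0.0,
    end_pts=2.0,
    pts_unit="sec",   # converted internally to stream pts as shown above
)
print(video.shape, info.get("video_fps"))
```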