Commit bf491463 authored by limm

add v0.19.1 release

parent e17f5ea2
import copy
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools import mask as coco_mask
from transforms import Compose
class FilterAndRemapCocoCategories(object):
class FilterAndRemapCocoCategories:
def __init__(self, categories, remap=True):
self.categories = categories
self.remap = remap
......@@ -43,7 +41,7 @@ def convert_coco_poly_to_mask(segmentations, height, width):
return masks
class ConvertCocoPolysToMask(object):
class ConvertCocoPolysToMask:
def __call__(self, image, anno):
w, h = image.size
segmentations = [obj["segmentation"] for obj in anno]
......@@ -70,7 +68,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None):
# if more than 1k pixels occupied in the image
return sum(obj["area"] for obj in anno) > 1000
assert isinstance(dataset, torchvision.datasets.CocoDetection)
ids = []
for ds_idx, img_id in enumerate(dataset.ids):
ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
......@@ -84,26 +81,32 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None):
return dataset
def get_coco(root, image_set, transforms):
def get_coco(root, image_set, transforms, use_v2=False):
PATHS = {
"train": ("train2017", os.path.join("annotations", "instances_train2017.json")),
"val": ("val2017", os.path.join("annotations", "instances_val2017.json")),
# "train": ("val2017", os.path.join("annotations", "instances_val2017.json"))
}
CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
1, 64, 20, 63, 7, 72]
transforms = Compose([
FilterAndRemapCocoCategories(CAT_LIST, remap=True),
ConvertCocoPolysToMask(),
transforms
])
CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72]
img_folder, ann_file = PATHS[image_set]
img_folder = os.path.join(root, img_folder)
ann_file = os.path.join(root, ann_file)
dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms)
# The 2 "Compose" below achieve the same thing: converting coco detection
# samples into segmentation-compatible samples. They just do it with
# slightly different implementations. We could refactor and unify, but
# keeping them separate helps keeping the v2 version clean
if use_v2:
import v2_extras
from torchvision.datasets import wrap_dataset_for_transforms_v2
transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms])
dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms)
dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"})
else:
transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms])
dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset, CAT_LIST)
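For orientation, a minimal usage sketch of `get_coco` under both code paths; the data path and the trailing transform are placeholders, not values from this commit.

```python
# Hedged sketch: calling get_coco for the v1 and v2 pipelines (paths are placeholders).
from transforms import Compose
from coco_utils import get_coco

extra = Compose([])  # stand-in for the preset transforms built in presets.py below
ds_v1 = get_coco("/data/coco", image_set="train", transforms=extra, use_v2=False)
ds_v2 = get_coco("/data/coco", image_set="train", transforms=extra, use_v2=True)
```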
......
import transforms as T
import torch
def get_modules(use_v2):
# We need a protected import to avoid the V2 warning in case just V1 is used
if use_v2:
import torchvision.transforms.v2
import torchvision.tv_tensors
import v2_extras
return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras
else:
import transforms
return transforms, None, None
class SegmentationPresetTrain:
def __init__(self, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
min_size = int(0.5 * base_size)
max_size = int(2.0 * base_size)
def __init__(
self,
*,
base_size,
crop_size,
hflip_prob=0.5,
mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225),
backend="pil",
use_v2=False,
):
T, tv_tensors, v2_extras = get_modules(use_v2)
transforms = []
backend = backend.lower()
if backend == "tv_tensor":
transforms.append(T.ToImage())
elif backend == "tensor":
transforms.append(T.PILToTensor())
elif backend != "pil":
raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")
transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))]
trans = [T.RandomResize(min_size, max_size)]
if hflip_prob > 0:
trans.append(T.RandomHorizontalFlip(hflip_prob))
trans.extend([
T.RandomCrop(crop_size),
T.ToTensor(),
T.Normalize(mean=mean, std=std),
])
self.transforms = T.Compose(trans)
transforms += [T.RandomHorizontalFlip(hflip_prob)]
if use_v2:
# We need a custom pad transform here, since the padding we want to perform here is fundamentally
# different from the padding in `RandomCrop` if `pad_if_needed=True`.
transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})]
transforms += [T.RandomCrop(crop_size)]
if backend == "pil":
transforms += [T.PILToTensor()]
if use_v2:
img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor
transforms += [
T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True)
]
else:
# No need to explicitly convert masks as they're magically int64 already
transforms += [T.ToDtype(torch.float, scale=True)]
transforms += [T.Normalize(mean=mean, std=std)]
if use_v2:
transforms += [T.ToPureTensor()]
self.transforms = T.Compose(transforms)
def __call__(self, img, target):
return self.transforms(img, target)
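A minimal sketch of how this preset is instantiated for the v1 pipeline; the base/crop sizes mirror `get_transform()` in train.py, while the input image and mask are placeholders.

```python
# Hedged sketch: building the training preset with the values used by get_transform() below.
train_tf = SegmentationPresetTrain(base_size=520, crop_size=480)  # defaults: backend="pil", use_v2=False
# image is a PIL image, mask a PIL segmentation mask of the same size:
# image_t, mask_t = train_tf(image, mask)  # -> float image tensor and int64 mask tensor
```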
class SegmentationPresetEval:
def __init__(self, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
self.transforms = T.Compose([
T.RandomResize(base_size, base_size),
T.ToTensor(),
def __init__(
self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False
):
T, _, _ = get_modules(use_v2)
transforms = []
backend = backend.lower()
if backend == "tensor":
transforms += [T.PILToTensor()]
elif backend == "tv_tensor":
transforms += [T.ToImage()]
elif backend != "pil":
raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")
if use_v2:
transforms += [T.Resize(size=(base_size, base_size))]
else:
transforms += [T.RandomResize(min_size=base_size, max_size=base_size)]
if backend == "pil":
# Note: we could just convert to pure tensors even in v2?
transforms += [T.ToImage() if use_v2 else T.PILToTensor()]
transforms += [
T.ToDtype(torch.float, scale=True),
T.Normalize(mean=mean, std=std),
])
]
if use_v2:
transforms += [T.ToPureTensor()]
self.transforms = T.Compose(transforms)
def __call__(self, img, target):
return self.transforms(img, target)
import datetime
import os
import time
import warnings
import presets
import torch
import torch.utils.data
from torch import nn
import torchvision
from coco_utils import get_coco
import presets
import utils
from coco_utils import get_coco
from torch import nn
from torch.optim.lr_scheduler import PolynomialLR
from torchvision.transforms import functional as F, InterpolationMode
def get_dataset(dir_path, name, image_set, transform):
def get_dataset(args, is_train):
def sbd(*args, **kwargs):
return torchvision.datasets.SBDataset(*args, mode='segmentation', **kwargs)
kwargs.pop("use_v2")
return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs)
def voc(*args, **kwargs):
kwargs.pop("use_v2")
return torchvision.datasets.VOCSegmentation(*args, **kwargs)
paths = {
"voc": (dir_path, torchvision.datasets.VOCSegmentation, 21),
"voc_aug": (dir_path, sbd, 21),
"coco": (dir_path, get_coco, 21)
"voc": (args.data_path, voc, 21),
"voc_aug": (args.data_path, sbd, 21),
"coco": (args.data_path, get_coco, 21),
}
p, ds_fn, num_classes = paths[name]
p, ds_fn, num_classes = paths[args.dataset]
ds = ds_fn(p, image_set=image_set, transforms=transform)
image_set = "train" if is_train else "val"
ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2)
return ds, num_classes
def get_transform(train):
base_size = 520
crop_size = 480
def get_transform(is_train, args):
if is_train:
return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2)
elif args.weights and args.test_only:
weights = torchvision.models.get_weight(args.weights)
trans = weights.transforms()
return presets.SegmentationPresetTrain(base_size, crop_size) if train else presets.SegmentationPresetEval(base_size)
def preprocessing(img, target):
img = trans(img)
size = F.get_dimensions(img)[1:]
target = F.resize(target, size, interpolation=InterpolationMode.NEAREST)
return img, F.pil_to_tensor(target)
return preprocessing
else:
return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2)
def criterion(inputs, target):
......@@ -39,42 +59,66 @@ def criterion(inputs, target):
losses[name] = nn.functional.cross_entropy(x, target, ignore_index=255)
if len(losses) == 1:
return losses['out']
return losses["out"]
return losses['out'] + 0.5 * losses['aux']
return losses["out"] + 0.5 * losses["aux"]
def evaluate(model, data_loader, device, num_classes):
model.eval()
confmat = utils.ConfusionMatrix(num_classes)
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
with torch.no_grad():
header = "Test:"
num_processed_samples = 0
with torch.inference_mode():
for image, target in metric_logger.log_every(data_loader, 100, header):
image, target = image.to(device), target.to(device)
output = model(image)
output = output['out']
output = output["out"]
confmat.update(target.flatten(), output.argmax(1).flatten())
# FIXME need to take into account that the datasets
# could have been padded in distributed setup
num_processed_samples += image.shape[0]
confmat.reduce_from_all_processes()
num_processed_samples = utils.reduce_across_processes(num_processed_samples)
if (
hasattr(data_loader.dataset, "__len__")
and len(data_loader.dataset) != num_processed_samples
and torch.distributed.get_rank() == 0
):
# See FIXME above
warnings.warn(
f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
"samples were used for the validation, which might bias the results. "
"Try adjusting the batch size and / or the world size. "
"Setting the world size to 1 is always a safe bet."
)
return confmat
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, print_freq):
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, print_freq, scaler=None):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
header = 'Epoch: [{}]'.format(epoch)
metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
header = f"Epoch: [{epoch}]"
for image, target in metric_logger.log_every(data_loader, print_freq, header):
image, target = image.to(device), target.to(device)
output = model(image)
loss = criterion(output, target)
with torch.cuda.amp.autocast(enabled=scaler is not None):
output = model(image)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if scaler is not None:
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
lr_scheduler.step()
......@@ -82,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi
def main(args):
if args.backend.lower() != "pil" and not args.use_v2:
# TODO: Support tensor backend in V1?
raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.")
if args.use_v2 and args.dataset != "coco":
raise ValueError("v2 is only support supported for coco dataset for now.")
if args.output_dir:
utils.mkdir(args.output_dir)
......@@ -90,29 +140,42 @@ def main(args):
device = torch.device(args.device)
dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(train=True))
dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(train=False))
if args.use_deterministic_algorithms:
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)
else:
torch.backends.cudnn.benchmark = True
dataset, num_classes = get_dataset(args, is_train=True)
dataset_test, _ = get_dataset(args, is_train=False)
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
else:
train_sampler = torch.utils.data.RandomSampler(dataset)
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=args.batch_size,
sampler=train_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn, drop_last=True)
dataset,
batch_size=args.batch_size,
sampler=train_sampler,
num_workers=args.workers,
collate_fn=utils.collate_fn,
drop_last=True,
)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1,
sampler=test_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
)
model = torchvision.models.segmentation.__dict__[args.model](num_classes=num_classes,
aux_loss=args.aux_loss,
pretrained=args.pretrained)
model = torchvision.models.get_model(
args.model,
weights=args.weights,
weights_backbone=args.weights_backbone,
num_classes=num_classes,
aux_loss=args.aux_loss,
)
model.to(device)
if args.distributed:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
......@@ -129,23 +192,50 @@ def main(args):
if args.aux_loss:
params = [p for p in model_without_ddp.aux_classifier.parameters() if p.requires_grad]
params_to_optimize.append({"params": params, "lr": args.lr * 10})
optimizer = torch.optim.SGD(
params_to_optimize,
lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
scaler = torch.cuda.amp.GradScaler() if args.amp else None
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
optimizer,
lambda x: (1 - x / (len(data_loader) * args.epochs)) ** 0.9)
iters_per_epoch = len(data_loader)
main_lr_scheduler = PolynomialLR(
optimizer, total_iters=iters_per_epoch * (args.epochs - args.lr_warmup_epochs), power=0.9
)
if args.lr_warmup_epochs > 0:
warmup_iters = iters_per_epoch * args.lr_warmup_epochs
args.lr_warmup_method = args.lr_warmup_method.lower()
if args.lr_warmup_method == "linear":
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
)
elif args.lr_warmup_method == "constant":
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
)
else:
raise RuntimeError(
f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported."
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
)
else:
lr_scheduler = main_lr_scheduler
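To see what the chained schedule does, a self-contained sketch with a toy optimizer and made-up iteration counts (not the real training configuration):

```python
# Illustrative only: linear warmup followed by polynomial decay, chained as in the code above.
import torch
from torch.optim.lr_scheduler import LinearLR, PolynomialLR, SequentialLR

params = [torch.zeros(1, requires_grad=True)]
opt = torch.optim.SGD(params, lr=0.01)
warmup_iters = 5
warmup = LinearLR(opt, start_factor=0.01, total_iters=warmup_iters)
main = PolynomialLR(opt, total_iters=20, power=0.9)
sched = SequentialLR(opt, schedulers=[warmup, main], milestones=[warmup_iters])
for it in range(25):
    opt.step()
    sched.step()
    print(it, round(opt.param_groups[0]["lr"], 5))  # ramps up for 5 iters, then decays polynomially
```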
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'], strict=not args.test_only)
checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True)
model_without_ddp.load_state_dict(checkpoint["model"], strict=not args.test_only)
if not args.test_only:
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
args.start_epoch = checkpoint['epoch'] + 1
optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint["epoch"] + 1
if args.amp:
scaler.load_state_dict(checkpoint["scaler"])
if args.test_only:
# We disable the cudnn benchmarking because it can noticeably affect the accuracy
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
print(confmat)
return
......@@ -154,54 +244,62 @@ def main(args):
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, args.print_freq)
train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, args.print_freq, scaler)
confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
print(confmat)
checkpoint = {
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'epoch': epoch,
'args': args
"model": model_without_ddp.state_dict(),
"optimizer": optimizer.state_dict(),
"lr_scheduler": lr_scheduler.state_dict(),
"epoch": epoch,
"args": args,
}
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'checkpoint.pth'))
if args.amp:
checkpoint["scaler"] = scaler.state_dict()
utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
print(f"Training time {total_time_str}")
def get_args_parser(add_help=True):
import argparse
parser = argparse.ArgumentParser(description='PyTorch Segmentation Training', add_help=add_help)
parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset path')
parser.add_argument('--dataset', default='coco', help='dataset name')
parser.add_argument('--model', default='fcn_resnet101', help='model')
parser.add_argument('--aux-loss', action='store_true', help='auxiliar loss')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('-b', '--batch-size', default=8, type=int)
parser.add_argument('--epochs', default=30, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=16, type=int, metavar='N',
help='number of data loading workers (default: 16)')
parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='start epoch')
parser = argparse.ArgumentParser(description="PyTorch Segmentation Training", add_help=add_help)
parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path")
parser.add_argument("--dataset", default="coco", type=str, help="dataset name")
parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name")
parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument(
"-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
)
parser.add_argument("--epochs", default=30, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument(
"-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
)
parser.add_argument("--lr", default=0.01, type=float, help="initial learning rate")
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
"--wd",
"--weight-decay",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
dest="weight_decay",
)
parser.add_argument("--lr-warmup-epochs", default=0, type=int, help="the number of epochs to warmup (default: 0)")
parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
parser.add_argument("--lr-warmup-decay", default=0.01, type=float, help="the decay for lr")
parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs")
parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
parser.add_argument(
"--test-only",
dest="test_only",
......@@ -209,16 +307,20 @@ def get_args_parser(add_help=True):
action="store_true",
)
parser.add_argument(
"--pretrained",
dest="pretrained",
help="Use pre-trained models from the modelzoo",
action="store_true",
"--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
)
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
parser.add_argument("--weights-backbone", default=None, type=str, help="the backbone weights enum name to load")
# Mixed precision training parameters
parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")
parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive")
parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms")
return parser
......
import numpy as np
from PIL import Image
import random
import numpy as np
import torch
from torchvision import transforms as T
from torchvision.transforms import functional as F
......@@ -17,7 +16,7 @@ def pad_if_smaller(img, size, fill=0):
return img
class Compose(object):
class Compose:
def __init__(self, transforms):
self.transforms = transforms
......@@ -27,7 +26,7 @@ class Compose(object):
return image, target
class RandomResize(object):
class RandomResize:
def __init__(self, min_size, max_size=None):
self.min_size = min_size
if max_size is None:
......@@ -36,12 +35,12 @@ class RandomResize(object):
def __call__(self, image, target):
size = random.randint(self.min_size, self.max_size)
image = F.resize(image, size)
target = F.resize(target, size, interpolation=Image.NEAREST)
image = F.resize(image, size, antialias=True)
target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST)
return image, target
class RandomHorizontalFlip(object):
class RandomHorizontalFlip:
def __init__(self, flip_prob):
self.flip_prob = flip_prob
......@@ -52,7 +51,7 @@ class RandomHorizontalFlip(object):
return image, target
class RandomCrop(object):
class RandomCrop:
def __init__(self, size):
self.size = size
......@@ -65,7 +64,7 @@ class RandomCrop(object):
return image, target
class CenterCrop(object):
class CenterCrop:
def __init__(self, size):
self.size = size
......@@ -75,14 +74,26 @@ class CenterCrop(object):
return image, target
class ToTensor(object):
class PILToTensor:
def __call__(self, image, target):
image = F.to_tensor(image)
image = F.pil_to_tensor(image)
target = torch.as_tensor(np.array(target), dtype=torch.int64)
return image, target
class Normalize(object):
class ToDtype:
def __init__(self, dtype, scale=False):
self.dtype = dtype
self.scale = scale
def __call__(self, image, target):
if not self.scale:
return image.to(dtype=self.dtype), target
image = F.convert_image_dtype(image, self.dtype)
return image, target
class Normalize:
def __init__(self, mean, std):
self.mean = mean
self.std = std
......
from collections import defaultdict, deque
import datetime
import errno
import os
import time
from collections import defaultdict, deque
import torch
import torch.distributed as dist
import errno
import os
class SmoothedValue(object):
class SmoothedValue:
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
......@@ -30,11 +30,7 @@ class SmoothedValue(object):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier()
dist.all_reduce(t)
t = reduce_across_processes([self.count, self.total])
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
......@@ -63,14 +59,11 @@ class SmoothedValue(object):
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
)
class ConfusionMatrix(object):
class ConfusionMatrix:
def __init__(self, num_classes):
self.num_classes = num_classes
self.mat = None
......@@ -79,7 +72,7 @@ class ConfusionMatrix(object):
n = self.num_classes
if self.mat is None:
self.mat = torch.zeros((n, n), dtype=torch.int64, device=a.device)
with torch.no_grad():
with torch.inference_mode():
k = (a >= 0) & (a < n)
inds = n * a[k].to(torch.int64) + b[k]
self.mat += torch.bincount(inds, minlength=n**2).reshape(n, n)
......@@ -95,27 +88,19 @@ class ConfusionMatrix(object):
return acc_global, acc, iu
def reduce_from_all_processes(self):
if not torch.distributed.is_available():
return
if not torch.distributed.is_initialized():
return
torch.distributed.barrier()
torch.distributed.all_reduce(self.mat)
self.mat = reduce_across_processes(self.mat).to(torch.int64)
def __str__(self):
acc_global, acc, iu = self.compute()
return (
'global correct: {:.1f}\n'
'average row correct: {}\n'
'IoU: {}\n'
'mean IoU: {:.1f}').format(
acc_global.item() * 100,
['{:.1f}'.format(i) for i in (acc * 100).tolist()],
['{:.1f}'.format(i) for i in (iu * 100).tolist()],
iu.mean().item() * 100)
class MetricLogger(object):
return ("global correct: {:.1f}\naverage row correct: {}\nIoU: {}\nmean IoU: {:.1f}").format(
acc_global.item() * 100,
[f"{i:.1f}" for i in (acc * 100).tolist()],
[f"{i:.1f}" for i in (iu * 100).tolist()],
iu.mean().item() * 100,
)
class MetricLogger:
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
......@@ -124,7 +109,10 @@ class MetricLogger(object):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
if not isinstance(v, (float, int)):
raise TypeError(
f"This method expects the value of the input arguments to be of type float or int, instead got {type(v)}"
)
self.meters[k].update(v)
def __getattr__(self, attr):
......@@ -132,15 +120,12 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
loss_str.append(f"{name}: {str(meter)}")
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
......@@ -153,31 +138,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
log_msg = self.delimiter.join(
[
header,
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
"max mem: {memory:.0f}",
]
)
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
log_msg = self.delimiter.join(
[header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
......@@ -187,21 +169,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
print(
log_msg.format(
i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {}'.format(header, total_time_str))
print(f"{header} Total time: {total_time_str}")
def cat_list(images, fill_value=0):
......@@ -209,7 +198,7 @@ def cat_list(images, fill_value=0):
batch_shape = (len(images),) + max_size
batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
for img, pad_img in zip(images, batched_imgs):
pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img)
pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
return batched_imgs
......@@ -233,10 +222,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
......@@ -273,26 +263,38 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
# elif "SLURM_PROCID" in os.environ:
# args.rank = int(os.environ["SLURM_PROCID"])
# args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
print('Not using distributed mode')
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
args.dist_backend = "nccl"
print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True)
torch.distributed.init_process_group(
backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
def reduce_across_processes(val):
if not is_dist_avail_and_initialized():
# nothing to sync, but we still convert to tensor for consistency with the distributed case.
return torch.tensor(val)
t = torch.tensor(val, device="cuda")
dist.barrier()
dist.all_reduce(t)
return t
"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1."""
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
class PadIfSmaller(v2.Transform):
def __init__(self, size, fill=0):
super().__init__()
self.size = size
self.fill = v2._utils._setup_fill_arg(fill)
def _get_params(self, sample):
_, height, width = v2._utils.query_chw(sample)
padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)]
needs_padding = any(padding)
return dict(padding=padding, needs_padding=needs_padding)
def _transform(self, inpt, params):
if not params["needs_padding"]:
return inpt
fill = v2._utils._get_fill(self.fill, type(inpt))
fill = v2._utils._convert_fill_arg(fill)
return v2.functional.pad(inpt, padding=params["padding"], fill=fill)
class CocoDetectionToVOCSegmentation(v2.Transform):
"""Turn samples from datasets.CocoDetection into the same format as VOCSegmentation.
This is achieved in two steps:
1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately,
the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not
present in VOC are dropped and replaced by background.
2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual
mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where
the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation
mask while pixels that belong to multiple detection masks are marked as invalid.
"""
COCO_TO_VOC_LABEL_MAP = dict(
zip(
[0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72],
range(21),
)
)
INVALID_VALUE = 255
def _coco_detection_masks_to_voc_segmentation_mask(self, target):
if "masks" not in target:
return None
instance_masks, instance_labels_coco = target["masks"], target["labels"]
valid_labels_voc = [
(idx, label_voc)
for idx, label_coco in enumerate(instance_labels_coco.tolist())
if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None
]
if not valid_labels_voc:
return None
valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc)
instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8)
instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8)
# Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as
# there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step.
segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0)
segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE
return segmentation_mask
def forward(self, image, target):
segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target)
if segmentation_mask is None:
segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8)
return image, tv_tensors.Mask(segmentation_mask)
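A toy illustration of the mask-merging rule implemented above, on synthetic tensors: per-instance masks are scaled by their VOC label, max-reduced over instances, and pixels covered by more than one instance are marked invalid (255).

```python
import torch

# two instance masks over a 2x3 image (synthetic data, not COCO)
instance_masks = torch.tensor(
    [[[1, 1, 0],
      [0, 1, 0]],
     [[0, 1, 1],
      [0, 0, 1]]],
    dtype=torch.uint8,
)
instance_labels_voc = torch.tensor([3, 7], dtype=torch.uint8)

segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0)
segmentation_mask[instance_masks.sum(dim=0) > 1] = 255  # overlapping pixel becomes invalid
print(segmentation_mask)
# tensor([[  3, 255,   7],
#         [  0,   3,   7]], dtype=torch.uint8)
```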
'''
"""
Pytorch adaptation of https://omoindrot.github.io/triplet-loss
https://github.com/omoindrot/tensorflow-triplet-loss
'''
"""
import torch
import torch.nn as nn
class TripletMarginLoss(nn.Module):
def __init__(self, margin=1.0, p=2., mining='batch_all'):
super(TripletMarginLoss, self).__init__()
def __init__(self, margin=1.0, p=2.0, mining="batch_all"):
super().__init__()
self.margin = margin
self.p = p
self.mining = mining
if mining == 'batch_all':
if mining == "batch_all":
self.loss_fn = batch_all_triplet_loss
if mining == 'batch_hard':
if mining == "batch_hard":
self.loss_fn = batch_hard_triplet_loss
def forward(self, embeddings, labels):
......
......@@ -4,7 +4,7 @@ import torchvision.models as models
class EmbeddingNet(nn.Module):
def __init__(self, backbone=None):
super(EmbeddingNet, self).__init__()
super().__init__()
if backbone is None:
backbone = models.resnet50(num_classes=128)
......
import random
from collections import defaultdict
import torch
from torch.utils.data.sampler import Sampler
from collections import defaultdict
import random
def create_groups(groups, k):
......@@ -46,7 +47,8 @@ class PKSampler(Sampler):
self.groups = create_groups(groups, self.k)
# Ensures there are enough classes to sample from
assert len(self.groups) >= p
if len(self.groups) < p:
raise ValueError("There are not enough classes to sample from")
def __iter__(self):
# Shuffle samples within groups
......
import unittest
from collections import defaultdict
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
import torch
import torchvision.transforms as transforms
from sampler import PKSampler
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
class Tester(unittest.TestCase):
def test_pksampler(self):
p, k = 16, 4
......@@ -19,8 +18,13 @@ class Tester(unittest.TestCase):
self.assertRaises(AssertionError, PKSampler, targets, p, k)
# Ensure p, k constraints on batch
dataset = FakeData(size=1000, num_classes=100, image_size=(3, 1, 1),
transform=transforms.ToTensor())
trans = transforms.Compose(
[
transforms.PILToTensor(),
transforms.ConvertImageDtype(torch.float),
]
)
dataset = FakeData(size=1000, num_classes=100, image_size=(3, 1, 1), transform=trans)
targets = [target.item() for _, target in dataset]
sampler = PKSampler(targets, p, k)
loader = DataLoader(dataset, batch_size=p * k, sampler=sampler)
......@@ -38,5 +42,5 @@ class Tester(unittest.TestCase):
self.assertEqual(bins[b], k)
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
import os
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from loss import TripletMarginLoss
from sampler import PKSampler
from model import EmbeddingNet
from sampler import PKSampler
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.datasets import FashionMNIST
def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_freq):
......@@ -33,7 +31,7 @@ def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_f
i += 1
avg_loss = running_loss / print_freq
avg_trip = 100.0 * running_frac_pos_triplets / print_freq
print('[{:d}, {:d}] | loss: {:.4f} | % avg hard triplets: {:.2f}%'.format(epoch, i, avg_loss, avg_trip))
print(f"[{epoch:d}, {i:d}] | loss: {avg_loss:.4f} | % avg hard triplets: {avg_trip:.2f}%")
running_loss = 0
running_frac_pos_triplets = 0
......@@ -53,7 +51,7 @@ def find_best_threshold(dists, targets, device):
return best_thresh, accuracy
@torch.no_grad()
@torch.inference_mode()
def evaluate(model, loader, device):
model.eval()
embeds, labels = [], []
......@@ -79,33 +77,45 @@ def evaluate(model, loader, device):
threshold, accuracy = find_best_threshold(dists, targets, device)
print('accuracy: {:.3f}%, threshold: {:.2f}'.format(accuracy, threshold))
print(f"accuracy: {accuracy:.3f}%, threshold: {threshold:.2f}")
def save(model, epoch, save_dir, file_name):
file_name = 'epoch_' + str(epoch) + '__' + file_name
file_name = "epoch_" + str(epoch) + "__" + file_name
save_path = os.path.join(save_dir, file_name)
torch.save(model.state_dict(), save_path)
def main(args):
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if args.use_deterministic_algorithms:
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)
else:
torch.backends.cudnn.benchmark = True
p = args.labels_per_batch
k = args.samples_per_label
batch_size = p * k
model = EmbeddingNet()
if args.resume:
model.load_state_dict(torch.load(args.resume))
model.load_state_dict(torch.load(args.resume, weights_only=True))
model.to(device)
criterion = TripletMarginLoss(margin=args.margin)
optimizer = Adam(model.parameters(), lr=args.lr)
transform = transforms.Compose([transforms.Lambda(lambda image: image.convert('RGB')),
transforms.Resize((224, 224)),
transforms.ToTensor()])
transform = transforms.Compose(
[
transforms.Lambda(lambda image: image.convert("RGB")),
transforms.Resize((224, 224)),
transforms.PILToTensor(),
transforms.ConvertImageDtype(torch.float),
]
)
# Using FMNIST to demonstrate embedding learning using triplet loss. This dataset can
# be replaced with any classification dataset.
......@@ -118,48 +128,60 @@ def main(args):
# targets attribute with the same format.
targets = train_dataset.targets.tolist()
train_loader = DataLoader(train_dataset, batch_size=batch_size,
sampler=PKSampler(targets, p, k),
num_workers=args.workers)
test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size,
shuffle=False,
num_workers=args.workers)
train_loader = DataLoader(
train_dataset, batch_size=batch_size, sampler=PKSampler(targets, p, k), num_workers=args.workers
)
test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers)
if args.test_only:
# We disable the cudnn benchmarking because it can noticeably affect the accuracy
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
evaluate(model, test_loader, device)
return
for epoch in range(1, args.epochs + 1):
print('Training...')
print("Training...")
train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq)
print('Evaluating...')
print("Evaluating...")
evaluate(model, test_loader, device)
print('Saving...')
save(model, epoch, args.save_dir, 'ckpt.pth')
print("Saving...")
save(model, epoch, args.save_dir, "ckpt.pth")
def parse_args():
import argparse
parser = argparse.ArgumentParser(description='PyTorch Embedding Learning')
parser.add_argument('--dataset-dir', default='/tmp/fmnist/',
help='FashionMNIST dataset directory path')
parser.add_argument('-p', '--labels-per-batch', default=8, type=int,
help='Number of unique labels/classes per batch')
parser.add_argument('-k', '--samples-per-label', default=8, type=int,
help='Number of samples per label in a batch')
parser.add_argument('--eval-batch-size', default=512, type=int)
parser.add_argument('--epochs', default=10, type=int, metavar='N',
help='Number of training epochs to run')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='Number of data loading workers')
parser.add_argument('--lr', default=0.0001, type=float, help='Learning rate')
parser.add_argument('--margin', default=0.2, type=float, help='Triplet loss margin')
parser.add_argument('--print-freq', default=20, type=int, help='Print frequency')
parser.add_argument('--save-dir', default='.', help='Model save directory')
parser.add_argument('--resume', default='', help='Resume from checkpoint')
parser = argparse.ArgumentParser(description="PyTorch Embedding Learning")
parser.add_argument("--dataset-dir", default="/tmp/fmnist/", type=str, help="FashionMNIST dataset directory path")
parser.add_argument(
"-p", "--labels-per-batch", default=8, type=int, help="Number of unique labels/classes per batch"
)
parser.add_argument("-k", "--samples-per-label", default=8, type=int, help="Number of samples per label in a batch")
parser.add_argument("--eval-batch-size", default=512, type=int, help="batch size for evaluation")
parser.add_argument("--epochs", default=10, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument("-j", "--workers", default=4, type=int, metavar="N", help="number of data loading workers")
parser.add_argument("--lr", default=0.0001, type=float, help="initial learning rate")
parser.add_argument("--margin", default=0.2, type=float, help="Triplet loss margin")
parser.add_argument("--print-freq", default=20, type=int, help="print frequency")
parser.add_argument("--save-dir", default=".", type=str, help="Model save directory")
parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
parser.add_argument(
"--test-only",
dest="test_only",
help="Only test the model",
action="store_true",
)
parser.add_argument(
"--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
)
return parser.parse_args()
if __name__ == '__main__':
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -18,11 +18,11 @@ We assume the training and validation AVI videos are stored at `/data/kinectics4
Run the training on a single node with 8 GPUs:
```bash
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --apex
torchrun --nproc_per_node=8 train.py --data-path=/data/kinectics400 --kinetics-version="400" --lr 0.08 --cache-dataset --sync-bn --amp
```
**Note:** all our models were trained on 8 nodes with 8 V100 GPUs each for a total of 64 GPUs. Expected training time for 64 GPUs is 24 hours, depending on the storage solution.
**Note 2:** hyperparameters for exact replication of our training can be found [here](https://github.com/pytorch/vision/blob/master/torchvision/models/video/README.md). Some hyperparameters such as learning rate are scaled linearly in proportion to the number of GPUs.
**Note 2:** hyperparameters for exact replication of our training can be found in the section below. Some hyperparameters such as learning rate must be scaled linearly in proportion to the number of GPUs. The default values assume 64 GPUs.
### Single GPU
......@@ -30,5 +30,96 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-
```bash
python train.py --data-path=/data/kinectics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
python train.py --data-path=/data/kinectics400 --kinetics-version="400" --batch-size=8 --cache-dataset
```
### Additional Kinetics versions
Since the original release, additional versions of the Kinetics dataset have become available (Kinetics 600).
Our training scripts support these versions of the dataset as well; set the `--kinetics-version` parameter to `"600"`.
**Note:** training on Kinetics 600 requires a different set of hyperparameters for optimal performance. We do not provide Kinetics 600 pretrained models.
## Video classification models
Starting with version `0.4.0` we have introduced support for basic video tasks and video classification modelling.
For more information about the available models check [here](https://pytorch.org/docs/stable/torchvision/models.html#video-classification).
### Video ResNet models
See reference training script [here](https://github.com/pytorch/vision/blob/main/references/video_classification/train.py):
- input space: RGB
- resize size: [128, 171]
- crop size: [112, 112]
- mean: [0.43216, 0.394666, 0.37645]
- std: [0.22803, 0.22145, 0.216989]
- number of classes: 400
Input data augmentations at training time (with optional parameters):
1. ConvertImageDtype
2. Resize (resize size value above)
3. Random horizontal flip (0.5)
4. Normalization (mean, std, see values above)
5. Random Crop (crop size value above)
6. Convert BCHW to CBHW
Input data augmentations at validation time (with optional parameters):
1. ConvertImageDtype
2. Resize (resize size value above)
3. Normalization (mean, std, see values above)
4. Center Crop (crop size value above)
5. Convert BCHW to CBHW
This translates into the following set of command-line arguments. Please note that the `--batch-size` parameter controls the
batch size per GPU. Moreover, note that our default `--lr` is configured for 64 GPUs, which is how many we used for the
video ResNet models:
```
# number of frames per clip
--clip-len 16 \
--frame-rate 15 \
# allow for temporal jittering
--clips-per-video 5 \
--batch-size 24 \
--epochs 45 \
--lr 0.64 \
# we use 10 epochs for linear warmup
--lr-warmup-epochs 10 \
# learning rate is decayed at 20, 30, and 40 epoch by a factor of 10
--lr-milestones 20 30 40 \
--lr-gamma 0.1 \
--train-resize-size 128 171 \
--train-crop-size 112 112 \
--val-resize-size 128 171 \
--val-crop-size 112 112
```
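As a quick sanity check of the linear scaling rule mentioned above (the default `--lr 0.64` is tuned for 64 GPUs):

```python
# Back-of-the-envelope LR scaling; 8 GPUs matches the --lr 0.08 used in the torchrun example above.
default_lr, default_gpus = 0.64, 64
for n_gpus in (8, 16, 64):
    print(n_gpus, default_lr * n_gpus / default_gpus)  # 0.08, 0.16, 0.64
```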
### S3D
The S3D model was trained similarly to the above but with the following changes to the default configuration:
```
--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \
--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224
```
We used 64 GPUs to train the architecture.
To estimate the validation statistics of the model, we run the reference script with the following configuration:
```
--batch-size=16 --test-only --clip-len 128 --clips-per-video 1
```
### Additional video modelling resources
- [Video Model Zoo](https://github.com/facebookresearch/VMZ)
- [PySlowFast](https://github.com/facebookresearch/SlowFast)
### References
[0] _D. Tran, H. Wang, L. Torresani, J. Ray, Y. LeCun and M. Paluri_: A Closer Look at Spatiotemporal Convolutions for Action Recognition. _CVPR 2018_ ([paper](https://research.fb.com/wp-content/uploads/2018/04/a-closer-look-at-spatiotemporal-convolutions-for-action-recognition.pdf))
[1] _W. Kay, J. Carreira, K. Simonyan, B. Zhang, C. Hillier, S. Vijayanarasimhan, F. Viola, T. Green, T. Back, P. Natsev, M. Suleyman, A. Zisserman_: The Kinetics Human Action Video Dataset ([paper](https://arxiv.org/abs/1705.06950))
from typing import Tuple
import torchvision
from torch import Tensor
class KineticsWithVideoId(torchvision.datasets.Kinetics):
def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int, int]:
video, audio, info, video_idx = self.video_clips.get_clip(idx)
label = self.samples[video_idx][1]
if self.transform is not None:
video = self.transform(video)
return video, audio, label, video_idx
import torch
from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW
from transforms import ConvertBCHWtoCBHW
class VideoClassificationPresetTrain:
def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989),
hflip_prob=0.5):
def __init__(
self,
*,
crop_size,
resize_size,
mean=(0.43216, 0.394666, 0.37645),
std=(0.22803, 0.22145, 0.216989),
hflip_prob=0.5,
):
trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
# We hard-code antialias=False to preserve results after we changed
# its default from None to True (see
# https://github.com/pytorch/vision/pull/7160)
# TODO: we could re-train the video models with antialias=True?
transforms.Resize(resize_size, antialias=False),
]
if hflip_prob > 0:
trans.append(transforms.RandomHorizontalFlip(hflip_prob))
trans.extend([
transforms.Normalize(mean=mean, std=std),
transforms.RandomCrop(crop_size),
ConvertBCHWtoCBHW()
])
trans.extend([transforms.Normalize(mean=mean, std=std), transforms.RandomCrop(crop_size), ConvertBCHWtoCBHW()])
self.transforms = transforms.Compose(trans)
def __call__(self, x):
......@@ -26,15 +31,20 @@ class VideoClassificationPresetTrain:
class VideoClassificationPresetEval:
def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose([
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std),
transforms.CenterCrop(crop_size),
ConvertBCHWtoCBHW()
])
def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose(
[
transforms.ConvertImageDtype(torch.float32),
# We hard-code antialias=False to preserve results after we changed
# its default from None to True (see
# https://github.com/pytorch/vision/pull/7160)
# TODO: we could re-train the video models with antialias=True?
transforms.Resize(resize_size, antialias=False),
transforms.Normalize(mean=mean, std=std),
transforms.CenterCrop(crop_size),
ConvertBCHWtoCBHW(),
]
)
def __call__(self, x):
return self.transforms(x)
import torch
from bisect import bisect_right
class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(
self,
optimizer,
milestones,
gamma=0.1,
warmup_factor=1.0 / 3,
warmup_iters=5,
warmup_method="linear",
last_epoch=-1,
):
if milestones != sorted(milestones):
raise ValueError(f"Milestones should be a list of increasing integers. Got {milestones}")
if warmup_method not in ("constant", "linear"):
raise ValueError(f"Only 'constant' or 'linear' warmup_method accepted, got {warmup_method}")
self.milestones = milestones
self.gamma = gamma
self.warmup_factor = warmup_factor
self.warmup_iters = warmup_iters
self.warmup_method = warmup_method
super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
def get_lr(self):
warmup_factor = 1
if self.last_epoch < self.warmup_iters:
if self.warmup_method == "constant":
warmup_factor = self.warmup_factor
elif self.warmup_method == "linear":
alpha = float(self.last_epoch) / self.warmup_iters
warmup_factor = self.warmup_factor * (1 - alpha) + alpha
return [
base_lr *
warmup_factor *
self.gamma ** bisect_right(self.milestones, self.last_epoch)
for base_lr in self.base_lrs
]
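For intuition, a small worked example of the step-decay term in `get_lr` above (toy milestones, warmup omitted):

```python
from bisect import bisect_right

milestones, gamma, base_lr = [20, 30], 0.1, 0.64
for epoch in (10, 25, 35):
    print(epoch, round(base_lr * gamma ** bisect_right(milestones, epoch), 4))
# 10 0.64, 25 0.064, 35 0.0064
```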
import datetime
import os
import time
import warnings
import datasets
import presets
import torch
import torch.utils.data
from torch.utils.data.dataloader import default_collate
from torch import nn
import torchvision
import torchvision.datasets.video_utils
from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
import presets
import utils
from scheduler import WarmupMultiStepLR
try:
from apex import amp
except ImportError:
amp = None
from torch import nn
from torch.utils.data.dataloader import default_collate
from torchvision.datasets.samplers import DistributedSampler, RandomClipSampler, UniformClipSampler
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, apex=False):
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
metric_logger.add_meter('clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}'))
metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
metric_logger.add_meter("clips/s", utils.SmoothedValue(window_size=10, fmt="{value:.3f}"))
header = 'Epoch: [{}]'.format(epoch)
for video, target in metric_logger.log_every(data_loader, print_freq, header):
header = f"Epoch: [{epoch}]"
for video, target, _ in metric_logger.log_every(data_loader, print_freq, header):
start_time = time.time()
video, target = video.to(device), target.to(device)
output = model(video)
loss = criterion(output, target)
with torch.cuda.amp.autocast(enabled=scaler is not None):
output = model(video)
loss = criterion(output, target)
optimizer.zero_grad()
if apex:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if scaler is not None:
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
optimizer.step()
acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
batch_size = video.shape[0]
metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
metric_logger.meters['clips/s'].update(batch_size / (time.time() - start_time))
metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
metric_logger.meters["clips/s"].update(batch_size / (time.time() - start_time))
lr_scheduler.step()
def evaluate(model, criterion, data_loader, device):
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
with torch.no_grad():
for video, target in metric_logger.log_every(data_loader, 100, header):
header = "Test:"
num_processed_samples = 0
# Group and aggregate output of a video
num_videos = len(data_loader.dataset.samples)
num_classes = len(data_loader.dataset.classes)
agg_preds = torch.zeros((num_videos, num_classes), dtype=torch.float32, device=device)
agg_targets = torch.zeros((num_videos), dtype=torch.int32, device=device)
with torch.inference_mode():
for video, target, video_idx in metric_logger.log_every(data_loader, 100, header):
video = video.to(device, non_blocking=True)
target = target.to(device, non_blocking=True)
output = model(video)
loss = criterion(output, target)
# Use softmax to convert output into prediction probability
preds = torch.softmax(output, dim=1)
for b in range(video.size(0)):
idx = video_idx[b].item()
agg_preds[idx] += preds[b].detach()
agg_targets[idx] = target[b].detach().item()
acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
# FIXME need to take into account that the datasets
# could have been padded in distributed setup
batch_size = video.shape[0]
metric_logger.update(loss=loss.item())
            metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
            metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
num_processed_samples += batch_size
# gather the stats from all processes
num_processed_samples = utils.reduce_across_processes(num_processed_samples)
if isinstance(data_loader.sampler, DistributedSampler):
# Get the len of UniformClipSampler inside DistributedSampler
num_data_from_sampler = len(data_loader.sampler.dataset)
else:
num_data_from_sampler = len(data_loader.sampler)
if (
hasattr(data_loader.dataset, "__len__")
and num_data_from_sampler != num_processed_samples
and torch.distributed.get_rank() == 0
):
# See FIXME above
warnings.warn(
f"It looks like the sampler has {num_data_from_sampler} samples, but {num_processed_samples} "
"samples were used for the validation, which might bias the results. "
"Try adjusting the batch size and / or the world size. "
"Setting the world size to 1 is always a safe bet."
)
metric_logger.synchronize_between_processes()
    print(
        " * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}".format(
            top1=metric_logger.acc1, top5=metric_logger.acc5
        )
    )
# Reduce the agg_preds and agg_targets from all gpu and show result
agg_preds = utils.reduce_across_processes(agg_preds)
agg_targets = utils.reduce_across_processes(agg_targets, op=torch.distributed.ReduceOp.MAX)
agg_acc1, agg_acc5 = utils.accuracy(agg_preds, agg_targets, topk=(1, 5))
print(" * Video Acc@1 {acc1:.3f} Video Acc@5 {acc5:.3f}".format(acc1=agg_acc1, acc5=agg_acc5))
return metric_logger.acc1.global_avg
def _get_cache_path(filepath, args):
    import hashlib

    value = f"{filepath}-{args.clip_len}-{args.kinetics_version}-{args.frame_rate}"
    h = hashlib.sha1(value.encode()).hexdigest()
cache_path = os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")
cache_path = os.path.expanduser(cache_path)
return cache_path
......@@ -86,83 +128,100 @@ def _get_cache_path(filepath):
def collate_fn(batch):
# remove audio from the batch
    batch = [(d[0], d[2], d[3]) for d in batch]
return default_collate(batch)
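# Note (an assumption based on how the samples are unpacked below): each item coming out of
# datasets.KineticsWithVideoId is a (video, audio, label, video_idx) tuple, so collate_fn
# keeps the video (d[0]), the label (d[2]) and the video index (d[3]) and drops the audio.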
def main(args):
    if args.output_dir:
        utils.mkdir(args.output_dir)
utils.init_distributed_mode(args)
print(args)
print("torch version: ", torch.__version__)
print("torchvision version: ", torchvision.__version__)
device = torch.device(args.device)
    if args.use_deterministic_algorithms:
        torch.backends.cudnn.benchmark = False
        torch.use_deterministic_algorithms(True)
    else:
        torch.backends.cudnn.benchmark = True
# Data loading code
print("Loading data")
    val_resize_size = tuple(args.val_resize_size)
    val_crop_size = tuple(args.val_crop_size)
    train_resize_size = tuple(args.train_resize_size)
    train_crop_size = tuple(args.train_crop_size)

    traindir = os.path.join(args.data_path, "train")
    valdir = os.path.join(args.data_path, "val")
print("Loading training data")
st = time.time()
    cache_path = _get_cache_path(traindir, args)
    transform_train = presets.VideoClassificationPresetTrain(crop_size=train_crop_size, resize_size=train_resize_size)
if args.cache_dataset and os.path.exists(cache_path):
print("Loading dataset_train from {}".format(cache_path))
dataset, _ = torch.load(cache_path)
print(f"Loading dataset_train from {cache_path}")
dataset, _ = torch.load(cache_path, weights_only=True)
dataset.transform = transform_train
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache "
"on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400(
traindir,
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset = datasets.KineticsWithVideoId(
args.data_path,
frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="train",
step_between_clips=1,
transform=transform_train,
frame_rate=15,
extensions=('avi', 'mp4', )
frame_rate=args.frame_rate,
extensions=(
"avi",
"mp4",
),
output_format="TCHW",
)
if args.cache_dataset:
print("Saving dataset_train to {}".format(cache_path))
print(f"Saving dataset_train to {cache_path}")
utils.mkdir(os.path.dirname(cache_path))
utils.save_on_master((dataset, traindir), cache_path)
print("Took", time.time() - st)
print("Loading validation data")
    cache_path = _get_cache_path(valdir, args)

    if args.weights and args.test_only:
        weights = torchvision.models.get_weight(args.weights)
        transform_test = weights.transforms()
    else:
        transform_test = presets.VideoClassificationPresetEval(crop_size=val_crop_size, resize_size=val_resize_size)
if args.cache_dataset and os.path.exists(cache_path):
print("Loading dataset_test from {}".format(cache_path))
dataset_test, _ = torch.load(cache_path)
print(f"Loading dataset_test from {cache_path}")
dataset_test, _ = torch.load(cache_path, weights_only=True)
dataset_test.transform = transform_test
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache "
"on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400(
valdir,
print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
dataset_test = datasets.KineticsWithVideoId(
args.data_path,
frames_per_clip=args.clip_len,
num_classes=args.kinetics_version,
split="val",
step_between_clips=1,
transform=transform_test,
frame_rate=15,
extensions=('avi', 'mp4',)
frame_rate=args.frame_rate,
extensions=(
"avi",
"mp4",
),
output_format="TCHW",
)
if args.cache_dataset:
print("Saving dataset_test to {}".format(cache_path))
print(f"Saving dataset_test to {cache_path}")
utils.mkdir(os.path.dirname(cache_path))
utils.save_on_master((dataset_test, valdir), cache_path)
......@@ -171,42 +230,64 @@ def main(args):
test_sampler = UniformClipSampler(dataset_test.video_clips, args.clips_per_video)
if args.distributed:
train_sampler = DistributedSampler(train_sampler)
        test_sampler = DistributedSampler(test_sampler, shuffle=False)
data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.workers,
        pin_memory=True,
        collate_fn=collate_fn,
    )
data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=args.batch_size,
        sampler=test_sampler,
        num_workers=args.workers,
        pin_memory=True,
        collate_fn=collate_fn,
    )
print("Creating model")
    model = torchvision.models.get_model(args.model, weights=args.weights)
model.to(device)
if args.distributed and args.sync_bn:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None
# convert scheduler to be per iteration, not per epoch, for warmup that lasts
# between different epochs
    iters_per_epoch = len(data_loader)
    lr_milestones = [iters_per_epoch * (m - args.lr_warmup_epochs) for m in args.lr_milestones]
    main_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=lr_milestones, gamma=args.lr_gamma)
if args.lr_warmup_epochs > 0:
warmup_iters = iters_per_epoch * args.lr_warmup_epochs
args.lr_warmup_method = args.lr_warmup_method.lower()
if args.lr_warmup_method == "linear":
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
)
elif args.lr_warmup_method == "constant":
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
)
else:
raise RuntimeError(
f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported."
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
)
else:
lr_scheduler = main_lr_scheduler
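    # Worked example with hypothetical numbers: with iters_per_epoch=1000, lr_warmup_epochs=10
    # and lr_milestones=[20, 30, 40], the warmup scheduler covers the first 10_000 iterations,
    # and the MultiStepLR milestones become [10_000, 20_000, 30_000] counted from the end of
    # warmup, i.e. the lr is still decayed at overall epochs 20, 30 and 40.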
model_without_ddp = model
if args.distributed:
......@@ -214,13 +295,18 @@ def main(args):
model_without_ddp = model.module
if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True)
model_without_ddp.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint["epoch"] + 1
if args.amp:
scaler.load_state_dict(checkpoint["scaler"])
if args.test_only:
# We disable the cudnn benchmarking because it can noticeably affect the accuracy
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
evaluate(model, criterion, data_loader_test, device=device)
return
......@@ -229,60 +315,69 @@ def main(args):
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq, scaler)
evaluate(model, criterion, data_loader_test, device=device)
if args.output_dir:
checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "epoch": epoch,
                "args": args,
            }
            if args.amp:
                checkpoint["scaler"] = scaler.state_dict()
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def get_args_parser(add_help=True):
import argparse
    parser = argparse.ArgumentParser(description="PyTorch Video Classification Training", add_help=add_help)
parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
parser.add_argument(
"--kinetics-version", default="400", type=str, choices=["400", "600"], help="Select kinetics version"
)
parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
parser.add_argument("--frame-rate", default=15, type=int, metavar="N", help="the frame rate")
parser.add_argument(
"--clips-per-video", default=5, type=int, metavar="N", help="maximum number of clips per video to consider"
)
parser.add_argument(
"-b", "--batch-size", default=24, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
)
parser.add_argument("--epochs", default=45, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument(
"-j", "--workers", default=10, type=int, metavar="N", help="number of data loading workers (default: 10)"
)
parser.add_argument("--lr", default=0.64, type=float, help="initial learning rate")
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
"--wd",
"--weight-decay",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
dest="weight_decay",
)
parser.add_argument("--lr-milestones", nargs="+", default=[20, 30, 40], type=int, help="decrease lr on milestones")
parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
parser.add_argument("--lr-warmup-epochs", default=10, type=int, help="the number of epochs to warmup (default: 10)")
parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
parser.add_argument("--lr-warmup-decay", default=0.001, type=float, help="the decay for lr")
parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs")
parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
parser.add_argument(
"--cache-dataset",
dest="cache_dataset",
......@@ -302,31 +397,50 @@ def parse_args():
action="store_true",
)
parser.add_argument(
"--pretrained",
dest="pretrained",
help="Use pre-trained models from the modelzoo",
action="store_true",
"--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
)
    # distributed training parameters
    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
    parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
parser.add_argument(
"--val-resize-size",
default=(128, 171),
nargs="+",
type=int,
help="the resize size used for validation (default: (128, 171))",
)
parser.add_argument(
"--val-crop-size",
default=(112, 112),
nargs="+",
type=int,
help="the central crop size used for validation (default: (112, 112))",
)
parser.add_argument(
"--train-resize-size",
default=(128, 171),
nargs="+",
type=int,
help="the resize size used for training (default: (128, 171))",
)
parser.add_argument(
"--train-crop-size",
default=(112, 112),
nargs="+",
type=int,
help="the random crop size used for training (default: (112, 112))",
)
parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
    # Mixed precision training parameters
    parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")

    return parser
if __name__ == "__main__":
    args = get_args_parser().parse_args()
main(args)
......@@ -2,17 +2,8 @@ import torch
import torch.nn as nn
class ConvertBCHWtoCBHW(nn.Module):
    """Convert tensor from (B, C, H, W) to (C, B, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(1, 0, 2, 3)
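# Note (an assumption about how the presets use this transform, not stated in this file): it is
# typically applied to a single clip of shape (T, C, H, W), so the "batch" dimension here is
# time, and the result is the (C, T, H, W) layout expected by the video models before the
# DataLoader adds the real batch dimension.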
import datetime
import errno
import os
import time
from collections import defaultdict, deque

import torch
import torch.distributed as dist
class SmoothedValue:
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
......@@ -30,11 +30,7 @@ class SmoothedValue(object):
"""
Warning: does not synchronize the deque!
"""
        t = reduce_across_processes([self.count, self.total])
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
......@@ -63,14 +59,11 @@ class SmoothedValue(object):
def __str__(self):
return self.fmt.format(
            median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
        )
class MetricLogger:
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
......@@ -79,7 +72,10 @@ class MetricLogger(object):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
            if not isinstance(v, (float, int)):
                raise TypeError(
                    f"This method expects the value of the input arguments to be of type float or int, instead got {type(v)}"
                )
self.meters[k].update(v)
def __getattr__(self, attr):
......@@ -87,15 +83,12 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
            loss_str.append(f"{name}: {str(meter)}")
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
......@@ -108,31 +101,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
            header = ""
start_time = time.time()
end = time.time()
        iter_time = SmoothedValue(fmt="{avg:.4f}")
        data_time = SmoothedValue(fmt="{avg:.4f}")
        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
            log_msg = self.delimiter.join(
                [
                    header,
                    "[{0" + space_fmt + "}/{1}]",
                    "eta: {eta}",
                    "{meters}",
                    "time: {time}",
                    "data: {data}",
                    "max mem: {memory:.0f}",
                ]
            )
else:
            log_msg = self.delimiter.join(
                [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
            )
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
......@@ -142,26 +132,33 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
                print(
                    log_msg.format(
                        i,
                        len(iterable),
                        eta=eta_string,
                        meters=str(self),
                        time=str(iter_time),
                        data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB,
                    )
                )
else:
                print(
                    log_msg.format(
                        i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
                    )
                )
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print(f"{header} Total time: {total_time_str}")
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.inference_mode():
maxk = max(topk)
batch_size = target.size(0)
......@@ -189,10 +186,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
        force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
......@@ -229,26 +227,38 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
        args.gpu = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
        print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
    args.dist_backend = "nccl"
    print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
    )
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
def reduce_across_processes(val, op=dist.ReduceOp.SUM):
if not is_dist_avail_and_initialized():
# nothing to sync, but we still convert to tensor for consistency with the distributed case.
return torch.tensor(val)
t = torch.tensor(val, device="cuda")
dist.barrier()
dist.all_reduce(t, op=op)
return t
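# Example of the two code paths above: in a single-process run, reduce_across_processes(5)
# simply returns torch.tensor(5); with several distributed ranks it returns the element-wise
# reduction of the per-rank values on the current CUDA device (SUM by default, or the op that
# was passed in, e.g. torch.distributed.ReduceOp.MAX as used in evaluate()).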
Utility scripts
===============
* `fbcode_to_main_sync.sh`
This shell script is used to synchronise internal changes with the main repository.
To run this script:
.. code:: bash
chmod +x fbcode_to_main_sync.sh
./fbcode_to_main_sync.sh <commit_hash> <fork_name> <fork_main_branch>
where

``commit_hash`` is the commit hash on the fbsync branch from which the sync should start.

``fork_name`` is the name of the remote corresponding to your fork; you can check it with ``git remote -v``.

``fork_main_branch`` (optional) is the name of the main branch on your fork (default: "main").

This script creates PRs corresponding to the commits in fbsync. Please review them, add the [FBcode->GH] prefix to the title and publish them. Most importantly, add the [FBcode->GH] prefix at the beginning of the merge message as well.
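To make the arguments concrete, a typical invocation (with a hypothetical commit hash and an assumed remote name) would look like:

.. code:: bash

    git remote -v  # confirm the name of the remote pointing at your fork
    ./fbcode_to_main_sync.sh abc1234 origin main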
import pathlib
import re
import sys
MODEL_URL_PATTERN = re.compile(r"https://download[.]pytorch[.]org/models/.+?[.]pth")
def main(*roots):
model_urls = set()
for root in roots:
for path in pathlib.Path(root).rglob("*.py"):
with open(path, "r") as file:
for line in file:
model_urls.update(MODEL_URL_PATTERN.findall(line))
print("\n".join(sorted(model_urls)))
if __name__ == "__main__":
main(*sys.argv[1:])
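# Usage sketch (the file name below is an assumption, not part of this diff): passing one or
# more source roots, e.g.
#   python collect_model_urls.py torchvision test
# scans every *.py file under those roots and prints the sorted, de-duplicated set of
# https://download.pytorch.org/models/*.pth URLs that appear in them.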