"tests/vscode:/vscode.git/clone" did not exist on "b9aa485538b791658a2d4ebe6097f6fb52930d92"
Unverified commit 5f0edb97, authored by Philip Meier, committed by GitHub

Add ufmt (usort + black) as code formatter (#4384)



* add ufmt as code formatter

* cleanup

* quote ufmt requirement

* split imports into more groups

* regenerate circleci config

* fix CI

* clarify local testing utils section

* use ufmt pre-commit hook

* split relative imports into local category

* Revert "split relative imports into local category"

This reverts commit f2e224cde2008c56c9347c1f69746d39065cdd51.

* pin black and usort dependencies

* fix local test utils detection

* fix ufmt rev

* add reference utils to local category

* fix usort config

* remove custom categories sorting

* Run pre-commit without fixing flake8

* fix a double import introduced in the merge
Co-authored-by: Nicolas Hug <nicolashug@fb.com>
parent e45489b1
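Before the hunks below, a note on what the new formatter actually does: usort sorts imports into groups (standard library first, then everything else), alphabetizes each group, and places plain `import` statements ahead of `from` imports within a group, while black normalizes quoting and line wrapping. A minimal sketch of the resulting import layout, with illustrative module names rather than ones taken from this diff:

# Hypothetical module, shown only to illustrate the usort grouping rules.
import datetime
import os
from collections import defaultdict

# Non-stdlib modules (including the reference-local ones, after the
# "remove custom categories sorting" commit above) land in the next group,
# plain imports before from-imports.
import torch
import utils
from coco_utils import get_coco
from engine import train_one_epoch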
import bisect
import copy
import math
from collections import defaultdict
from itertools import repeat, chain

import numpy as np
import torch
import torch.utils.data
import torchvision
from PIL import Image
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.model_zoo import tqdm
def _repeat_to_at_least(iterable, n):
@@ -34,11 +33,11 @@ class GroupedBatchSampler(BatchSampler):
0, i.e. they must be in the range [0, num_groups).
batch_size (int): Size of mini-batch.
"""

def __init__(self, sampler, group_ids, batch_size):
if not isinstance(sampler, Sampler):
raise ValueError(
"sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
)
self.sampler = sampler
self.group_ids = group_ids
@@ -68,8 +67,7 @@ class GroupedBatchSampler(BatchSampler):
if num_remaining > 0:
# for the remaining batches, take first the buffers with largest number
# of elements
for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True):
remaining = self.batch_size - len(buffer_per_group[group_id])
samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
buffer_per_group[group_id].extend(samples_from_group_id[:remaining])
@@ -85,10 +83,12 @@ class GroupedBatchSampler(BatchSampler):
def _compute_aspect_ratios_slow(dataset, indices=None):
print(
"Your dataset doesn't support the fast path for "
"computing the aspect ratios, so will iterate over "
"the full dataset and load every image instead. "
"This might take some time..."
)
if indices is None:
indices = range(len(dataset))
@@ -104,9 +104,12 @@ def _compute_aspect_ratios_slow(dataset, indices=None):
sampler = SubsetSampler(indices)
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=1,
sampler=sampler,
num_workers=14,  # you might want to increase it for faster processing
collate_fn=lambda x: x[0],
)
aspect_ratios = []
with tqdm(total=len(dataset)) as pbar:
for _i, (img, _) in enumerate(data_loader):
......
import torch
import transforms as T
class DetectionPresetTrain:
def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)):
if data_augmentation == "hflip":
self.transforms = T.Compose(
[
T.RandomHorizontalFlip(p=hflip_prob),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
]
)
elif data_augmentation == "ssd":
self.transforms = T.Compose(
[
T.RandomPhotometricDistort(),
T.RandomZoomOut(fill=list(mean)),
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
]
)
elif data_augmentation == "ssdlite":
self.transforms = T.Compose(
[
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
]
)
else:
raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"')
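Since the three policies differ only in which transforms they compose, a short usage sketch may help; it assumes it is run from references/detection so that `presets` is importable, and that the preset exposes the usual `__call__(img, target)` entry point shown for the other presets in this diff:

import torch
from PIL import Image

import presets  # the reference module above; assumed to be on sys.path

preset = presets.DetectionPresetTrain(data_augmentation="hflip")
img = Image.new("RGB", (320, 240))  # dummy image
target = {"boxes": torch.tensor([[10.0, 20.0, 100.0, 120.0]]), "labels": torch.tensor([1])}
img, target = preset(img, target)  # float tensor image; boxes flipped with probability 0.5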
......
@@ -21,26 +21,20 @@ import datetime
import os
import time

import presets
import torch
import torch.utils.data
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn
import utils
from coco_utils import get_coco, get_coco_kp
from engine import train_one_epoch, evaluate
from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
def get_dataset(name, image_set, transform, data_path):
paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)}
p, ds_fn, num_classes = paths[name]
ds = ds_fn(p, image_set=image_set, transforms=transform)
@@ -53,42 +47,60 @@ def get_transform(train, data_augmentation):
def get_args_parser(add_help=True):
import argparse
parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help)
parser.add_argument("--data-path", default="/datasets01/COCO/022719/", help="dataset")
parser.add_argument("--dataset", default="coco", help="dataset")
parser.add_argument("--model", default="maskrcnn_resnet50_fpn", help="model")
parser.add_argument("--device", default="cuda", help="device")
parser.add_argument(
"-b", "--batch-size", default=2, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
)
parser.add_argument("--epochs", default=26, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument(
"-j", "--workers", default=4, type=int, metavar="N", help="number of data loading workers (default: 4)"
)
parser.add_argument(
"--lr",
default=0.02,
type=float,
help="initial learning rate, 0.02 is the default value for training " "on 8 gpus and 2 images_per_gpu",
)
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
"--wd",
"--weight-decay",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
dest="weight_decay",
)
parser.add_argument("--lr-scheduler", default="multisteplr", help="the lr scheduler (default: multisteplr)")
parser.add_argument(
"--lr-step-size", default=8, type=int, help="decrease lr every step-size epochs (multisteplr scheduler only)"
)
parser.add_argument(
"--lr-steps",
default=[16, 22],
nargs="+",
type=int,
help="decrease lr every step-size epochs (multisteplr scheduler only)",
)
parser.add_argument(
"--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma (multisteplr scheduler only)"
)
parser.add_argument("--print-freq", default=20, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", help="path where to save")
parser.add_argument("--resume", default="", help="resume from checkpoint")
parser.add_argument("--start_epoch", default=0, type=int, help="start epoch")
parser.add_argument("--aspect-ratio-group-factor", default=3, type=int)
parser.add_argument("--rpn-score-thresh", default=None, type=float, help="rpn score threshold for faster-rcnn")
parser.add_argument(
"--trainable-backbone-layers", default=None, type=int, help="number of trainable layers of backbone"
)
parser.add_argument("--data-augmentation", default="hflip", help="data augmentation policy (default: hflip)")
parser.add_argument(
"--sync-bn",
dest="sync_bn",
@@ -109,9 +121,8 @@ def get_args_parser(add_help=True):
)
# distributed training parameters
parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")
return parser
@@ -128,8 +139,9 @@ def main(args):
# Data loading code
print("Loading data")
dataset, num_classes = get_dataset(
args.dataset, "train", get_transform(True, args.data_augmentation), args.data_path
)
dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args.data_augmentation), args.data_path)
print("Creating data loaders")
@@ -144,27 +156,24 @@ def main(args):
group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
else:
train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True)
data_loader = torch.utils.data.DataLoader(
dataset, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
)
print("Creating model")
kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
if "rcnn" in args.model:
if args.rpn_score_thresh is not None:
kwargs["rpn_score_thresh"] = args.rpn_score_thresh
model = torchvision.models.detection.__dict__[args.model](
num_classes=num_classes, pretrained=args.pretrained, **kwargs
)
model.to(device)
if args.distributed and args.sync_bn:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
@@ -175,24 +184,25 @@ def main(args):
model_without_ddp = model.module
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
args.lr_scheduler = args.lr_scheduler.lower()
if args.lr_scheduler == "multisteplr":
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
elif args.lr_scheduler == "cosineannealinglr":
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
else:
raise RuntimeError("Invalid lr scheduler '{}'. Only MultiStepLR and CosineAnnealingLR "
"are supported.".format(args.lr_scheduler))
raise RuntimeError(
"Invalid lr scheduler '{}'. Only MultiStepLR and CosineAnnealingLR "
"are supported.".format(args.lr_scheduler)
)
if args.resume:
checkpoint = torch.load(args.resume, map_location="cpu")
model_without_ddp.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint["epoch"] + 1
if args.test_only:
evaluate(model, data_loader_test, device=device)
@@ -207,25 +217,21 @@ def main(args):
lr_scheduler.step()
if args.output_dir:
checkpoint = {
"model": model_without_ddp.state_dict(),
"optimizer": optimizer.state_dict(),
"lr_scheduler": lr_scheduler.state_dict(),
"args": args,
"epoch": epoch,
}
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
# evaluate after every epoch
evaluate(model, data_loader_test, device=device)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print("Training time {}".format(total_time_str))
if __name__ == "__main__":
......
@@ -28,8 +28,9 @@ class Compose(object):
class RandomHorizontalFlip(T.RandomHorizontalFlip):
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if torch.rand(1) < self.p:
image = F.hflip(image)
if target is not None:
@@ -45,16 +46,18 @@ class RandomHorizontalFlip(T.RandomHorizontalFlip):
class ToTensor(nn.Module):
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
image = F.pil_to_tensor(image)
image = F.convert_image_dtype(image)
return image, target
class PILToTensor(nn.Module):
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
image = F.pil_to_tensor(image)
return image, target
@@ -64,15 +67,23 @@ class ConvertImageDtype(nn.Module):
super().__init__()
self.dtype = dtype
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
image = F.convert_image_dtype(image, self.dtype)
return image, target
class RandomIoUCrop(nn.Module):
def __init__(
self,
min_scale: float = 0.3,
max_scale: float = 1.0,
min_aspect_ratio: float = 0.5,
max_aspect_ratio: float = 2.0,
sampler_options: Optional[List[float]] = None,
trials: int = 40,
):
super().__init__()
# Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
self.min_scale = min_scale
@@ -84,14 +95,15 @@ class RandomIoUCrop(nn.Module):
self.options = sampler_options
self.trials = trials
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if target is None:
raise ValueError("The targets can't be None for this transform.")
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
@@ -131,8 +143,9 @@ class RandomIoUCrop(nn.Module):
# check at least 1 box with jaccard limitations
boxes = target["boxes"][is_within_crop_area]
ious = torchvision.ops.boxes.box_iou(
boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device)
)
if ious.max() < min_jaccard_overlap:
continue
@@ -149,13 +162,15 @@ class RandomIoUCrop(nn.Module):
class RandomZoomOut(nn.Module):
def __init__(
self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5
):
super().__init__()
if fill is None:
fill = [0.0, 0.0, 0.0]
self.fill = fill
self.side_range = side_range
if side_range[0] < 1.0 or side_range[0] > side_range[1]:
raise ValueError("Invalid canvas side range provided {}.".format(side_range))
self.p = p
@@ -165,11 +180,12 @@ class RandomZoomOut(nn.Module):
# We fake the type to make it work on JIT
return tuple(int(x) for x in self.fill) if is_pil else 0
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
@@ -196,8 +212,9 @@ class RandomZoomOut(nn.Module):
image = F.pad(image, [left, top, right, bottom], fill=fill)
if isinstance(image, torch.Tensor):
v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1)
image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[
..., :, (left + orig_w) :
] = v
if target is not None:
target["boxes"][:, 0::2] += left
@@ -207,8 +224,14 @@ class RandomZoomOut(nn.Module):
class RandomPhotometricDistort(nn.Module):
def __init__(
self,
contrast: Tuple[float] = (0.5, 1.5),
saturation: Tuple[float] = (0.5, 1.5),
hue: Tuple[float] = (-0.05, 0.05),
brightness: Tuple[float] = (0.875, 1.125),
p: float = 0.5,
):
super().__init__()
self._brightness = T.ColorJitter(brightness=brightness)
self._contrast = T.ColorJitter(contrast=contrast)
@@ -216,11 +239,12 @@ class RandomPhotometricDistort(nn.Module):
self._saturation = T.ColorJitter(saturation=saturation)
self.p = p
def forward(
self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
......
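An aside on the RandomIoUCrop hunk above: the reformatted call is a thin wrapper around `torchvision.ops.boxes.box_iou`, which gates each candidate crop on its best overlap with the ground-truth boxes. The same check in isolation, with illustrative coordinates:

import torch
import torchvision

# One ground-truth box and one candidate crop, both as (x1, y1, x2, y2).
boxes = torch.tensor([[50.0, 50.0, 150.0, 150.0]])
crop = torch.tensor([[40.0, 40.0, 160.0, 160.0]])
ious = torchvision.ops.boxes.box_iou(boxes, crop)
min_jaccard_overlap = 0.3  # RandomIoUCrop samples this threshold per trial
print(bool(ious.max() >= min_jaccard_overlap))  # True: this crop would be kept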
import datetime
import errno
import os
import time
from collections import defaultdict, deque

import torch
import torch.distributed as dist
@@ -32,7 +32,7 @@ class SmoothedValue(object):
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
@@ -63,11 +63,8 @@ class SmoothedValue(object):
def __str__(self):
return self.fmt.format(
median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
)
def all_gather(data):
@@ -130,15 +127,12 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append("{}: {}".format(name, str(meter)))
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
@@ -151,31 +145,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
log_msg = self.delimiter.join(
[
header,
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
"max mem: {memory:.0f}",
]
)
else:
log_msg = self.delimiter.join(
[header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
@@ -185,22 +176,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(
log_msg.format(
i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable)))
def collate_fn(batch):
@@ -220,10 +217,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__

builtin_print = __builtin__.print

def print(*args, **kwargs):
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
@@ -260,25 +258,25 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
else:
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = "nccl"
print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(
backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
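For readers unfamiliar with these helpers: SmoothedValue tracks a windowed series plus running totals, and MetricLogger.log_every wraps any iterable and prints the meters every print_freq steps. A minimal sketch, assuming the file above is importable as `utils` and that `MetricLogger.update` behaves as in the full file:

import time

import utils  # the reference utils module shown above; assumed importable

logger = utils.MetricLogger(delimiter="  ")
for step in logger.log_every(range(100), 20, header="Demo:"):
    time.sleep(0.01)  # stand-in for a real training step
    logger.update(loss=1.0 / (step + 1))  # meters are created lazily on first update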
import copy
import os

import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools import mask as coco_mask
from transforms import Compose
@@ -90,14 +88,9 @@ def get_coco(root, image_set, transforms):
"val": ("val2017", os.path.join("annotations", "instances_val2017.json")),
# "train": ("val2017", os.path.join("annotations", "instances_val2017.json"))
}
CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72]
transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms])
img_folder, ann_file = PATHS[image_set]
img_folder = os.path.join(root, img_folder)
......
import torch
import transforms as T
@@ -11,12 +10,14 @@ class SegmentationPresetTrain:
trans = [T.RandomResize(min_size, max_size)]
if hflip_prob > 0:
trans.append(T.RandomHorizontalFlip(hflip_prob))
trans.extend(
[
T.RandomCrop(crop_size),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
T.Normalize(mean=mean, std=std),
]
)
self.transforms = T.Compose(trans)
def __call__(self, img, target):
@@ -25,12 +26,14 @@ class SegmentationPresetTrain:
class SegmentationPresetEval:
def __init__(self, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
self.transforms = T.Compose(
[
T.RandomResize(base_size, base_size),
T.PILToTensor(),
T.ConvertImageDtype(torch.float),
T.Normalize(mean=mean, std=std),
]
)
def __call__(self, img, target):
return self.transforms(img, target)
@@ -2,23 +2,23 @@ import datetime
import os
import time

import presets
import torch
import torch.utils.data
import torchvision
import utils
from coco_utils import get_coco
from torch import nn
def get_dataset(dir_path, name, image_set, transform):
def sbd(*args, **kwargs):
return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs)
paths = {
"voc": (dir_path, torchvision.datasets.VOCSegmentation, 21),
"voc_aug": (dir_path, sbd, 21),
"coco": (dir_path, get_coco, 21)
"coco": (dir_path, get_coco, 21),
}
p, ds_fn, num_classes = paths[name]
@@ -39,21 +39,21 @@ def criterion(inputs, target):
losses[name] = nn.functional.cross_entropy(x, target, ignore_index=255)
if len(losses) == 1:
return losses["out"]
return losses["out"] + 0.5 * losses["aux"]
def evaluate(model, data_loader, device, num_classes):
model.eval()
confmat = utils.ConfusionMatrix(num_classes)
metric_logger = utils.MetricLogger(delimiter=" ")
header = "Test:"
with torch.no_grad():
for image, target in metric_logger.log_every(data_loader, 100, header):
image, target = image.to(device), target.to(device)
output = model(image)
output = output["out"]
confmat.update(target.flatten(), output.argmax(1).flatten())
@@ -65,8 +65,8 @@ def evaluate(model, data_loader, device, num_classes):
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, print_freq):
model.train()
metric_logger = utils.MetricLogger(delimiter="  ")
metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
header = "Epoch: [{}]".format(epoch)
for image, target in metric_logger.log_every(data_loader, print_freq, header):
image, target = image.to(device), target.to(device)
output = model(image)
@@ -101,18 +101,21 @@ def main(args):
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=args.batch_size,
sampler=train_sampler,
num_workers=args.workers,
collate_fn=utils.collate_fn,
drop_last=True,
)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
)
model = torchvision.models.segmentation.__dict__[args.model](
num_classes=num_classes, aux_loss=args.aux_loss, pretrained=args.pretrained
)
model.to(device)
if args.distributed:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
@@ -129,42 +132,42 @@ def main(args):
if args.aux_loss:
params = [p for p in model_without_ddp.aux_classifier.parameters() if p.requires_grad]
params_to_optimize.append({"params": params, "lr": args.lr * 10})
optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
iters_per_epoch = len(data_loader)
main_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
optimizer, lambda x: (1 - x / (iters_per_epoch * (args.epochs - args.lr_warmup_epochs))) ** 0.9
)
if args.lr_warmup_epochs > 0:
warmup_iters = iters_per_epoch * args.lr_warmup_epochs
args.lr_warmup_method = args.lr_warmup_method.lower()
if args.lr_warmup_method == "linear":
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
)
elif args.lr_warmup_method == "constant":
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
)
else:
raise RuntimeError("Invalid warmup lr method '{}'. Only linear and constant "
"are supported.".format(args.lr_warmup_method))
raise RuntimeError(
"Invalid warmup lr method '{}'. Only linear and constant "
"are supported.".format(args.lr_warmup_method)
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
)
else:
lr_scheduler = main_lr_scheduler
if args.resume:
checkpoint = torch.load(args.resume, map_location="cpu")
model_without_ddp.load_state_dict(checkpoint["model"], strict=not args.test_only)
if not args.test_only:
optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint["epoch"] + 1
if args.test_only:
confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
@@ -179,53 +182,54 @@ def main(args):
confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
print(confmat)
checkpoint = {
"model": model_without_ddp.state_dict(),
"optimizer": optimizer.state_dict(),
"lr_scheduler": lr_scheduler.state_dict(),
"epoch": epoch,
"args": args,
}
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print("Training time {}".format(total_time_str))
def get_args_parser(add_help=True):
import argparse
parser = argparse.ArgumentParser(description='PyTorch Segmentation Training', add_help=add_help)
parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset path')
parser.add_argument('--dataset', default='coco', help='dataset name')
parser.add_argument('--model', default='fcn_resnet101', help='model')
parser.add_argument('--aux-loss', action='store_true', help='auxiliar loss')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('-b', '--batch-size', default=8, type=int)
parser.add_argument('--epochs', default=30, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=16, type=int, metavar='N',
help='number of data loading workers (default: 16)')
parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--lr-warmup-epochs', default=0, type=int, help='the number of epochs to warmup (default: 0)')
parser.add_argument('--lr-warmup-method', default="linear", type=str, help='the warmup method (default: linear)')
parser.add_argument('--lr-warmup-decay', default=0.01, type=float, help='the decay for lr')
parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='start epoch')
parser = argparse.ArgumentParser(description="PyTorch Segmentation Training", add_help=add_help)
parser.add_argument("--data-path", default="/datasets01/COCO/022719/", help="dataset path")
parser.add_argument("--dataset", default="coco", help="dataset name")
parser.add_argument("--model", default="fcn_resnet101", help="model")
parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss")
parser.add_argument("--device", default="cuda", help="device")
parser.add_argument("-b", "--batch-size", default=8, type=int)
parser.add_argument("--epochs", default=30, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument(
"-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
)
parser.add_argument("--lr", default=0.01, type=float, help="initial learning rate")
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
"--wd",
"--weight-decay",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
dest="weight_decay",
)
parser.add_argument("--lr-warmup-epochs", default=0, type=int, help="the number of epochs to warmup (default: 0)")
parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
parser.add_argument("--lr-warmup-decay", default=0.01, type=float, help="the decay for lr")
parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", help="path where to save")
parser.add_argument("--resume", default="", help="resume from checkpoint")
parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
parser.add_argument(
"--test-only",
dest="test_only",
@@ -239,9 +243,8 @@ def get_args_parser(add_help=True):
action="store_true",
)
# distributed training parameters
parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")
return parser
......
import datetime
import errno
import os
import time
from collections import defaultdict, deque

import torch
import torch.distributed as dist
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
@@ -32,7 +32,7 @@ class SmoothedValue(object):
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
@@ -63,11 +63,8 @@ class SmoothedValue(object):
def __str__(self):
return self.fmt.format(
median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
)
class ConfusionMatrix(object):
@@ -82,7 +79,7 @@ class ConfusionMatrix(object):
with torch.no_grad():
k = (a >= 0) & (a < n)
inds = n * a[k].to(torch.int64) + b[k]
self.mat += torch.bincount(inds, minlength=n ** 2).reshape(n, n)
def reset(self):
self.mat.zero_()
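The update hunk above deserves a word: each (target, prediction) pair is flattened to the single index n * t + p, so one bincount call accumulates the whole n x n confusion matrix. The same trick standalone:

import torch

n = 3  # number of classes
target = torch.tensor([0, 1, 2, 2])
pred = torch.tensor([0, 2, 2, 1])
k = (target >= 0) & (target < n)  # mask out ignore labels such as 255
inds = n * target[k].to(torch.int64) + pred[k]  # row-major flat indices
mat = torch.bincount(inds, minlength=n ** 2).reshape(n, n)
print(mat)  # rows = ground truth, columns = prediction, diagonal = correct counts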
@@ -104,15 +101,12 @@ class ConfusionMatrix(object):
def __str__(self):
acc_global, acc, iu = self.compute()
return ("global correct: {:.1f}\n" "average row correct: {}\n" "IoU: {}\n" "mean IoU: {:.1f}").format(
acc_global.item() * 100,
["{:.1f}".format(i) for i in (acc * 100).tolist()],
["{:.1f}".format(i) for i in (iu * 100).tolist()],
iu.mean().item() * 100,
)
class MetricLogger(object):
@@ -132,15 +126,12 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append("{}: {}".format(name, str(meter)))
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
@@ -153,31 +144,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
log_msg = self.delimiter.join(
[
header,
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
"max mem: {memory:.0f}",
]
)
else:
log_msg = self.delimiter.join(
[header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
@@ -187,21 +175,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(
log_msg.format(
i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print("{} Total time: {}".format(header, total_time_str))
def cat_list(images, fill_value=0):
@@ -209,7 +204,7 @@ def cat_list(images, fill_value=0):
batch_shape = (len(images),) + max_size
batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
for img, pad_img in zip(images, batched_imgs):
pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
return batched_imgs
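cat_list exists because segmentation batches mix image sizes: every tensor is copied into the top-left corner of a buffer shaped like the largest one. The same idea in miniature:

import torch

images = [torch.ones(3, 2, 2), torch.ones(3, 3, 1)]
max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
batched = images[0].new_full((len(images),) + max_size, 0)  # targets would use fill_value=255
for img, pad_img in zip(images, batched):
    pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
print(batched.shape)  # torch.Size([2, 3, 3, 2])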
@@ -233,10 +228,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__

builtin_print = __builtin__.print

def print(*args, **kwargs):
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
@@ -273,26 +269,26 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = "nccl"
print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(
backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
)
setup_for_distributed(args.rank == 0)
"""
Pytorch adaptation of https://omoindrot.github.io/triplet-loss
https://github.com/omoindrot/tensorflow-triplet-loss
"""
import torch
import torch.nn as nn
class TripletMarginLoss(nn.Module):
def __init__(self, margin=1.0, p=2.0, mining="batch_all"):
super(TripletMarginLoss, self).__init__()
self.margin = margin
self.p = p
self.mining = mining
if mining == "batch_all":
self.loss_fn = batch_all_triplet_loss
if mining == "batch_hard":
self.loss_fn = batch_hard_triplet_loss
def forward(self, embeddings, labels):
......
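A hedged usage sketch of the loss above; it assumes, consistent with the train loop later in this diff, that forward returns the loss together with the fraction of active triplets:

import torch

from loss import TripletMarginLoss  # the module shown above; assumed importable

criterion = TripletMarginLoss(margin=0.2, mining="batch_all")
embeddings = torch.randn(16, 8, requires_grad=True)  # 16 samples, 8-dim embeddings
labels = torch.randint(0, 4, (16,))  # 4 classes
loss, frac_pos_triplets = criterion(embeddings, labels)
loss.backward()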
import random
from collections import defaultdict

import torch
from torch.utils.data.sampler import Sampler
def create_groups(groups, k):
......
import unittest
from collections import defaultdict

import torchvision.transforms as transforms
from sampler import PKSampler
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
class Tester(unittest.TestCase):
def test_pksampler(self):
p, k = 16, 4
@@ -19,8 +17,7 @@ class Tester(unittest.TestCase):
self.assertRaises(AssertionError, PKSampler, targets, p, k)
# Ensure p, k constraints on batch
dataset = FakeData(size=1000, num_classes=100, image_size=(3, 1, 1), transform=transforms.ToTensor())
targets = [target.item() for _, target in dataset]
sampler = PKSampler(targets, p, k)
loader = DataLoader(dataset, batch_size=p * k, sampler=sampler)
@@ -38,5 +35,5 @@ class Tester(unittest.TestCase):
self.assertEqual(bins[b], k)
if __name__ == "__main__":
unittest.main()
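The invariant the test checks is worth stating plainly: a PK batch holds exactly p distinct labels with k samples each, so the DataLoader batch size must equal p * k. A sketch under the assumption that sampler.py above is importable:

from sampler import PKSampler  # the module shown above; assumed importable

p, k = 16, 4
targets = [i % 100 for i in range(1000)]  # toy labels covering 100 classes
sampler = PKSampler(targets, p, k)
# Feeding this sampler to DataLoader(dataset, batch_size=p * k, sampler=sampler)
# should yield batches with 16 distinct labels and 4 samples per label.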
import os

import torch
import torchvision.transforms as transforms
from loss import TripletMarginLoss
from model import EmbeddingNet
from sampler import PKSampler
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.datasets import FashionMNIST
def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_freq):
@@ -33,7 +31,7 @@ def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_f
i += 1
avg_loss = running_loss / print_freq
avg_trip = 100.0 * running_frac_pos_triplets / print_freq
print("[{:d}, {:d}] | loss: {:.4f} | % avg hard triplets: {:.2f}%".format(epoch, i, avg_loss, avg_trip))
running_loss = 0
running_frac_pos_triplets = 0
@@ -79,17 +77,17 @@ def evaluate(model, loader, device):
threshold, accuracy = find_best_threshold(dists, targets, device)
print("accuracy: {:.3f}%, threshold: {:.2f}".format(accuracy, threshold))
def save(model, epoch, save_dir, file_name):
file_name = "epoch_" + str(epoch) + "__" + file_name
save_path = os.path.join(save_dir, file_name)
torch.save(model.state_dict(), save_path)
def main(args):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
p = args.labels_per_batch
k = args.samples_per_label
batch_size = p * k
@@ -103,9 +101,9 @@ def main(args):
criterion = TripletMarginLoss(margin=args.margin)
optimizer = Adam(model.parameters(), lr=args.lr)
transform = transforms.Compose(
[transforms.Lambda(lambda image: image.convert("RGB")), transforms.Resize((224, 224)), transforms.ToTensor()]
)
# Using FMNIST to demonstrate embedding learning using triplet loss. This dataset can
# be replaced with any classification dataset.
@@ -118,48 +116,44 @@ def main(args):
# targets attribute with the same format.
targets = train_dataset.targets.tolist()
train_loader = DataLoader(
train_dataset, batch_size=batch_size, sampler=PKSampler(targets, p, k), num_workers=args.workers
)
test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers)
for epoch in range(1, args.epochs + 1):
print("Training...")
train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq)
print("Evaluating...")
evaluate(model, test_loader, device)
print("Saving...")
save(model, epoch, args.save_dir, "ckpt.pth")
def parse_args():
import argparse
parser = argparse.ArgumentParser(description="PyTorch Embedding Learning")
parser.add_argument("--dataset-dir", default="/tmp/fmnist/", help="FashionMNIST dataset directory path")
parser.add_argument(
"-p", "--labels-per-batch", default=8, type=int, help="Number of unique labels/classes per batch"
)
parser.add_argument("-k", "--samples-per-label", default=8, type=int, help="Number of samples per label in a batch")
parser.add_argument("--eval-batch-size", default=512, type=int)
parser.add_argument("--epochs", default=10, type=int, metavar="N", help="Number of training epochs to run")
parser.add_argument("-j", "--workers", default=4, type=int, metavar="N", help="Number of data loading workers")
parser.add_argument("--lr", default=0.0001, type=float, help="Learning rate")
parser.add_argument("--margin", default=0.2, type=float, help="Triplet loss margin")
parser.add_argument("--print-freq", default=20, type=int, help="Print frequency")
parser.add_argument("--save-dir", default=".", help="Model save directory")
parser.add_argument("--resume", default="", help="Resume from checkpoint")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)
import torch
from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW
class VideoClassificationPresetTrain:
def __init__(
self,
resize_size,
crop_size,
mean=(0.43216, 0.394666, 0.37645),
std=(0.22803, 0.22145, 0.216989),
hflip_prob=0.5,
):
trans = [
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
@@ -14,11 +19,7 @@ class VideoClassificationPresetTrain:
]
if hflip_prob > 0:
trans.append(transforms.RandomHorizontalFlip(hflip_prob))
trans.extend([transforms.Normalize(mean=mean, std=std), transforms.RandomCrop(crop_size), ConvertBCHWtoCBHW()])
self.transforms = transforms.Compose(trans)
def __call__(self, x):
@@ -27,14 +28,16 @@ class VideoClassificationPresetTrain:
class VideoClassificationPresetEval:
def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose(
[
ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std),
transforms.CenterCrop(crop_size),
ConvertBCHWtoCBHW(),
]
)
def __call__(self, x):
return self.transforms(x)
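A minimal usage sketch for the presets above, assuming a uint8 clip laid out as (T, H, W, C) — the (B, H, W, C) layout that ConvertBHWCtoBCHW expects — and resize/crop sizes typical for these video models (the concrete sizes are assumptions, not part of this diff):

import torch
from presets import VideoClassificationPresetEval  # assumes this file is importable as `presets`

clip = torch.randint(0, 256, (16, 128, 171, 3), dtype=torch.uint8)  # 16 frames, HWC layout
preset = VideoClassificationPresetEval(resize_size=(128, 171), crop_size=(112, 112))
out = preset(clip)
print(out.shape)  # torch.Size([3, 16, 112, 112]), i.e. (C, T, H, W) as the video models expect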
import datetime
import os
import time
import presets
import torch
import torch.utils.data
from torch.utils.data.dataloader import default_collate
from torch import nn
import torchvision
import torchvision.datasets.video_utils
from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
import presets
import utils
from torch import nn
from torch.utils.data.dataloader import default_collate
from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
try:
from apex import amp
......@@ -21,10 +21,10 @@ except ImportError:
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, apex=False):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
metric_logger.add_meter('clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}'))
metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
metric_logger.add_meter("clips/s", utils.SmoothedValue(window_size=10, fmt="{value:.3f}"))
header = 'Epoch: [{}]'.format(epoch)
header = "Epoch: [{}]".format(epoch)
for video, target in metric_logger.log_every(data_loader, print_freq, header):
start_time = time.time()
video, target = video.to(device), target.to(device)
......@@ -42,16 +42,16 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, devi
acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
batch_size = video.shape[0]
metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
metric_logger.meters['clips/s'].update(batch_size / (time.time() - start_time))
metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
metric_logger.meters["clips/s"].update(batch_size / (time.time() - start_time))
lr_scheduler.step()
def evaluate(model, criterion, data_loader, device):
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
header = "Test:"
with torch.no_grad():
for video, target in metric_logger.log_every(data_loader, 100, header):
video = video.to(device, non_blocking=True)
......@@ -64,18 +64,22 @@ def evaluate(model, criterion, data_loader, device):
# could have been padded in distributed setup
batch_size = video.shape[0]
metric_logger.update(loss=loss.item())
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'
.format(top1=metric_logger.acc1, top5=metric_logger.acc5))
print(
" * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}".format(
top1=metric_logger.acc1, top5=metric_logger.acc5
)
)
return metric_logger.acc1.global_avg
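The utils.accuracy helper called above is collapsed in this diff; here is a self-contained sketch with the same call shape (top-k percentages over a batch), useful for sanity-checking the acc1/acc5 meters. The name topk_accuracy and the dummy logits are illustrative, not from the repo:

import torch

def topk_accuracy(output, target, topk=(1,)):
    # fraction of samples whose target is among the k highest logits, as a percentage
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1)           # (N, maxk) predicted class indices
    correct = pred.eq(target.unsqueeze(1))       # (N, maxk) hit mask
    return [correct[:, :k].any(dim=1).float().mean() * 100.0 for k in topk]

logits = torch.tensor([[0.1, 0.7, 0.2], [0.8, 0.15, 0.05]])
acc1, acc2 = topk_accuracy(logits, torch.tensor([1, 2]), topk=(1, 2))
print(acc1.item(), acc2.item())  # 50.0 50.0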
def _get_cache_path(filepath):
import hashlib
h = hashlib.sha1(filepath.encode()).hexdigest()
cache_path = os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")
cache_path = os.path.expanduser(cache_path)
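The tail of _get_cache_path is collapsed above, but the visible lines already determine the cache location: a SHA1 of the dataset directory path, truncated to 10 hex chars, under ~/.torch/vision/datasets/kinetics. A quick reproduction (the example path is illustrative):

import hashlib
import os

filepath = "/datasets01_101/kinetics/070618/train_avi-480p"  # illustrative, matching the defaults below
h = hashlib.sha1(filepath.encode()).hexdigest()
print(os.path.expanduser(os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")))
# e.g. /home/user/.torch/vision/datasets/kinetics/<first-10-hex-chars>.pt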
......@@ -90,8 +94,10 @@ def collate_fn(batch):
def main(args):
if args.apex and amp is None:
raise RuntimeError("Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
"to enable mixed-precision training.")
raise RuntimeError(
"Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
"to enable mixed-precision training."
)
if args.output_dir:
utils.mkdir(args.output_dir)
......@@ -121,15 +127,17 @@ def main(args):
dataset.transform = transform_train
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache "
"on a single-gpu first, as it will be faster")
print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400(
traindir,
frames_per_clip=args.clip_len,
step_between_clips=1,
transform=transform_train,
frame_rate=15,
extensions=('avi', 'mp4', )
extensions=(
"avi",
"mp4",
),
)
if args.cache_dataset:
print("Saving dataset_train to {}".format(cache_path))
......@@ -149,15 +157,17 @@ def main(args):
dataset_test.transform = transform_test
else:
if args.distributed:
print("It is recommended to pre-compute the dataset cache "
"on a single-gpu first, as it will be faster")
print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400(
valdir,
frames_per_clip=args.clip_len,
step_between_clips=1,
transform=transform_test,
frame_rate=15,
extensions=('avi', 'mp4',)
extensions=(
"avi",
"mp4",
),
)
if args.cache_dataset:
print("Saving dataset_test to {}".format(cache_path))
......@@ -172,14 +182,22 @@ def main(args):
test_sampler = DistributedSampler(test_sampler)
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=args.batch_size,
sampler=train_sampler, num_workers=args.workers,
pin_memory=True, collate_fn=collate_fn)
dataset,
batch_size=args.batch_size,
sampler=train_sampler,
num_workers=args.workers,
pin_memory=True,
collate_fn=collate_fn,
)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=args.batch_size,
sampler=test_sampler, num_workers=args.workers,
pin_memory=True, collate_fn=collate_fn)
dataset_test,
batch_size=args.batch_size,
sampler=test_sampler,
num_workers=args.workers,
pin_memory=True,
collate_fn=collate_fn,
)
print("Creating model")
model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained)
......@@ -190,13 +208,10 @@ def main(args):
criterion = nn.CrossEntropyLoss()
lr = args.lr * args.world_size
optimizer = torch.optim.SGD(
model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay)
if args.apex:
model, optimizer = amp.initialize(model, optimizer,
opt_level=args.apex_opt_level
)
model, optimizer = amp.initialize(model, optimizer, opt_level=args.apex_opt_level)
# convert scheduler to be per iteration, not per epoch, for warmup that lasts
# between different epochs
......@@ -207,20 +222,22 @@ def main(args):
if args.lr_warmup_epochs > 0:
warmup_iters = iters_per_epoch * args.lr_warmup_epochs
args.lr_warmup_method = args.lr_warmup_method.lower()
if args.lr_warmup_method == 'linear':
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=args.lr_warmup_decay,
total_iters=warmup_iters)
elif args.lr_warmup_method == 'constant':
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=args.lr_warmup_decay,
total_iters=warmup_iters)
if args.lr_warmup_method == "linear":
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
)
elif args.lr_warmup_method == "constant":
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
)
else:
raise RuntimeError("Invalid warmup lr method '{}'. Only linear and constant "
"are supported.".format(args.lr_warmup_method))
raise RuntimeError(
"Invalid warmup lr method '{}'. Only linear and constant "
"are supported.".format(args.lr_warmup_method)
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
optimizer,
schedulers=[warmup_lr_scheduler, main_lr_scheduler],
milestones=[warmup_iters]
optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
)
else:
lr_scheduler = main_lr_scheduler
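A self-contained sketch of the warmup chaining above: LinearLR ramps the learning rate for warmup_iters steps, then SequentialLR hands over to the main schedule, with everything stepped per iteration as the surrounding comment describes. The MultiStepLR main schedule is an assumption (it is collapsed in this diff); the milestone scaling mirrors the per-iteration conversion:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.08)

iters_per_epoch = 100
warmup_iters = iters_per_epoch * 10  # args.lr_warmup_epochs = 10
main_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(  # assumed main schedule
    optimizer, milestones=[m * iters_per_epoch for m in (20, 30, 40)], gamma=0.1
)
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.001, total_iters=warmup_iters  # args.lr_warmup_decay default
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
)
for _ in range(warmup_iters):
    optimizer.step()
    lr_scheduler.step()  # stepped once per iteration, not per epoch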
......@@ -231,11 +248,11 @@ def main(args):
model_without_ddp = model.module
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
args.start_epoch = checkpoint['epoch'] + 1
checkpoint = torch.load(args.resume, map_location="cpu")
model_without_ddp.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint["epoch"] + 1
if args.test_only:
evaluate(model, criterion, data_loader_test, device=device)
......@@ -246,62 +263,65 @@ def main(args):
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader,
device, epoch, args.print_freq, args.apex)
train_one_epoch(
model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq, args.apex
)
evaluate(model, criterion, data_loader_test, device=device)
if args.output_dir:
checkpoint = {
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'epoch': epoch,
'args': args}
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'checkpoint.pth'))
"model": model_without_ddp.state_dict(),
"optimizer": optimizer.state_dict(),
"lr_scheduler": lr_scheduler.state_dict(),
"epoch": epoch,
"args": args,
}
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
print("Training time {}".format(total_time_str))
def parse_args():
import argparse
parser = argparse.ArgumentParser(description='PyTorch Video Classification Training')
parser.add_argument('--data-path', default='/datasets01_101/kinetics/070618/', help='dataset')
parser.add_argument('--train-dir', default='train_avi-480p', help='name of train dir')
parser.add_argument('--val-dir', default='val_avi-480p', help='name of val dir')
parser.add_argument('--model', default='r2plus1d_18', help='model')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('--clip-len', default=16, type=int, metavar='N',
help='number of frames per clip')
parser.add_argument('--clips-per-video', default=5, type=int, metavar='N',
help='maximum number of clips per video to consider')
parser.add_argument('-b', '--batch-size', default=24, type=int)
parser.add_argument('--epochs', default=45, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=10, type=int, metavar='N',
help='number of data loading workers (default: 10)')
parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--lr-milestones', nargs='+', default=[20, 30, 40], type=int, help='decrease lr on milestones')
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
parser.add_argument('--lr-warmup-epochs', default=10, type=int, help='the number of epochs to warmup (default: 10)')
parser.add_argument('--lr-warmup-method', default="linear", type=str, help='the warmup method (default: linear)')
parser.add_argument('--lr-warmup-decay', default=0.001, type=float, help='the decay for lr')
parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='start epoch')
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", help="dataset")
parser.add_argument("--train-dir", default="train_avi-480p", help="name of train dir")
parser.add_argument("--val-dir", default="val_avi-480p", help="name of val dir")
parser.add_argument("--model", default="r2plus1d_18", help="model")
parser.add_argument("--device", default="cuda", help="device")
parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
parser.add_argument(
"--clips-per-video", default=5, type=int, metavar="N", help="maximum number of clips per video to consider"
)
parser.add_argument("-b", "--batch-size", default=24, type=int)
parser.add_argument("--epochs", default=45, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument(
"-j", "--workers", default=10, type=int, metavar="N", help="number of data loading workers (default: 10)"
)
parser.add_argument("--lr", default=0.01, type=float, help="initial learning rate")
parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument(
"--wd",
"--weight-decay",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
dest="weight_decay",
)
parser.add_argument("--lr-milestones", nargs="+", default=[20, 30, 40], type=int, help="decrease lr on milestones")
parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
parser.add_argument("--lr-warmup-epochs", default=10, type=int, help="the number of epochs to warmup (default: 10)")
parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
parser.add_argument("--lr-warmup-decay", default=0.001, type=float, help="the decay for lr")
parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", help="path where to save")
parser.add_argument("--resume", default="", help="resume from checkpoint")
parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
parser.add_argument(
"--cache-dataset",
dest="cache_dataset",
......@@ -328,18 +348,19 @@ def parse_args():
)
# Mixed precision training parameters
parser.add_argument('--apex', action='store_true',
help='Use apex for mixed precision training')
parser.add_argument('--apex-opt-level', default='O1', type=str,
help='For apex mixed precision training'
'O0 for FP32 training, O1 for mixed precision training.'
'For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet'
)
parser.add_argument("--apex", action="store_true", help="Use apex for mixed precision training")
parser.add_argument(
"--apex-opt-level",
default="O1",
type=str,
help="For apex mixed precision training"
"O0 for FP32 training, O1 for mixed precision training."
"For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet",
)
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")
args = parser.parse_args()
......
......@@ -3,16 +3,14 @@ import torch.nn as nn
class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W)
"""
"""Convert tensor from (B, H, W, C) to (B, C, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2)
class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W)
"""
"""Convert tensor from (B, C, H, W) to (C, B, H, W)"""
def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(1, 0, 2, 3)
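The two permutes in action — a quick shape check (the concrete sizes are illustrative):

import torch

x = torch.rand(8, 112, 112, 3)   # (B, H, W, C)
bchw = x.permute(0, 3, 1, 2)     # ConvertBHWCtoBCHW -> (B, C, H, W)
cbhw = bchw.permute(1, 0, 2, 3)  # ConvertBCHWtoCBHW -> (C, B, H, W)
print(bchw.shape, cbhw.shape)    # torch.Size([8, 3, 112, 112]) torch.Size([3, 8, 112, 112])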
from collections import defaultdict, deque
import datetime
import errno
import os
import time
from collections import defaultdict, deque
import torch
import torch.distributed as dist
import errno
import os
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
......@@ -32,7 +32,7 @@ class SmoothedValue(object):
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
......@@ -63,11 +63,8 @@ class SmoothedValue(object):
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
)
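A usage sketch for SmoothedValue, assuming the standard references-utils semantics (update(value, n=1) appends to a fixed-size window; avg is the window mean, global_avg is total/count over everything seen) — the constructor and __str__ keys match the lines above:

v = SmoothedValue(window_size=2, fmt="{avg:.2f} (global {global_avg:.2f})")
for x in (1.0, 2.0, 4.0):
    v.update(x)
print(v)  # window holds [2.0, 4.0] -> "3.00 (global 2.33)"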
class MetricLogger(object):
......@@ -87,15 +84,12 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
loss_str.append("{}: {}".format(name, str(meter)))
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
......@@ -108,31 +102,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
log_msg = self.delimiter.join(
[
header,
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
"max mem: {memory:.0f}",
]
)
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
log_msg = self.delimiter.join(
[header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
......@@ -142,21 +133,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
print(
log_msg.format(
i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {}'.format(header, total_time_str))
print("{} Total time: {}".format(header, total_time_str))
def accuracy(output, target, topk=(1,)):
......@@ -189,10 +187,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
......@@ -229,26 +228,26 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"):
pass
else:
print('Not using distributed mode')
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
args.dist_backend = "nccl"
print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(
backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
)
setup_for_distributed(args.rank == 0)
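For reference, a single-process sketch of driving init_distributed_mode through the env-var branch it checks first. In real use a launcher such as torchrun sets RANK/WORLD_SIZE/LOCAL_RANK; MASTER_ADDR and MASTER_PORT are needed by init_process_group. This assumes one CUDA device, since the function sets the device and uses the "nccl" backend:

import os
from types import SimpleNamespace

os.environ.update({"RANK": "0", "WORLD_SIZE": "1", "LOCAL_RANK": "0",
                   "MASTER_ADDR": "127.0.0.1", "MASTER_PORT": "29500"})
args = SimpleNamespace(dist_url="env://")
init_distributed_mode(args)  # sets args.gpu/args.rank/args.world_size, inits the process group
print(args.rank, args.world_size, args.distributed)  # 0 1 True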