Unverified commit 5f0edb97, authored by Philip Meier, committed by GitHub

Add ufmt (usort + black) as code formatter (#4384)



* add ufmt as code formatter

* cleanup

* quote ufmt requirement

* split imports into more groups

* regenerate circleci config

* fix CI

* clarify local testing utils section

* use ufmt pre-commit hook

* split relative imports into local category

* Revert "split relative imports into local category"

This reverts commit f2e224cde2008c56c9347c1f69746d39065cdd51.

* pin black and usort dependencies

* fix local test utils detection

* fix ufmt rev

* add reference utils to local category

* fix usort config

* remove custom categories sorting

* Run pre-commit without fixing flake8

* got a double import in merge

Co-authored-by: Nicolas Hug <nicolashug@fb.com>
parent e45489b1
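Note (illustration, not part of the commit): ufmt chains usort (import grouping) with black (code reflow), so the diff below is formatting-only and does not change behavior. A minimal sketch of driving the formatter from a script, assuming `pip install ufmt` and the documented `ufmt check` / `ufmt format` subcommands:

    # Hedged sketch: gate CI on ufmt-clean sources, or rewrite them in place.
    import subprocess
    import sys

    def run_ufmt(path: str = "references", fix: bool = False) -> bool:
        """Return True when `path` is (or has just been made) ufmt-clean."""
        cmd = ["ufmt", "format" if fix else "check", path]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.stdout:
            print(result.stdout, end="")
        return result.returncode == 0

    if __name__ == "__main__":
        sys.exit(0 if run_ufmt(fix="--fix" in sys.argv) else 1)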
references/detection/group_by_aspect_ratio.py:

 import bisect
-from collections import defaultdict
 import copy
-from itertools import repeat, chain
 import math
-import numpy as np
+from collections import defaultdict
+from itertools import repeat, chain
+import numpy as np
 import torch
 import torch.utils.data
-from torch.utils.data.sampler import BatchSampler, Sampler
-from torch.utils.model_zoo import tqdm
 import torchvision
 from PIL import Image
+from torch.utils.data.sampler import BatchSampler, Sampler
+from torch.utils.model_zoo import tqdm

 def _repeat_to_at_least(iterable, n):

@@ -34,11 +33,11 @@ class GroupedBatchSampler(BatchSampler):
         0, i.e. they must be in the range [0, num_groups).
         batch_size (int): Size of mini-batch.
     """
+
     def __init__(self, sampler, group_ids, batch_size):
         if not isinstance(sampler, Sampler):
             raise ValueError(
-                "sampler should be an instance of "
-                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+                "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
             )
         self.sampler = sampler
         self.group_ids = group_ids

@@ -68,8 +67,7 @@ class GroupedBatchSampler(BatchSampler):
         if num_remaining > 0:
             # for the remaining batches, take first the buffers with largest number
             # of elements
-            for group_id, _ in sorted(buffer_per_group.items(),
-                                      key=lambda x: len(x[1]), reverse=True):
+            for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True):
                 remaining = self.batch_size - len(buffer_per_group[group_id])
                 samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
                 buffer_per_group[group_id].extend(samples_from_group_id[:remaining])

@@ -85,10 +83,12 @@ class GroupedBatchSampler(BatchSampler):
 def _compute_aspect_ratios_slow(dataset, indices=None):
-    print("Your dataset doesn't support the fast path for "
-          "computing the aspect ratios, so will iterate over "
-          "the full dataset and load every image instead. "
-          "This might take some time...")
+    print(
+        "Your dataset doesn't support the fast path for "
+        "computing the aspect ratios, so will iterate over "
+        "the full dataset and load every image instead. "
+        "This might take some time..."
+    )
     if indices is None:
         indices = range(len(dataset))

@@ -104,9 +104,12 @@ def _compute_aspect_ratios_slow(dataset, indices=None):
     sampler = SubsetSampler(indices)
     data_loader = torch.utils.data.DataLoader(
-        dataset, batch_size=1, sampler=sampler,
+        dataset,
+        batch_size=1,
+        sampler=sampler,
         num_workers=14,  # you might want to increase it for faster processing
-        collate_fn=lambda x: x[0])
+        collate_fn=lambda x: x[0],
+    )
     aspect_ratios = []
     with tqdm(total=len(dataset)) as pbar:
         for _i, (img, _) in enumerate(data_loader):
...
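Usage sketch (illustration only): GroupedBatchSampler wraps another sampler and yields batches whose indices all share a group id, e.g. an aspect-ratio bucket. The toy dataset and the 0/1 grouping below are invented:

    import torch
    from torch.utils.data import DataLoader, RandomSampler, TensorDataset

    from group_by_aspect_ratio import GroupedBatchSampler

    # Toy data: ten "wide" and ten "tall" samples, encoded as group ids 0 and 1.
    dataset = TensorDataset(torch.randn(20, 3), torch.arange(20))
    group_ids = [0] * 10 + [1] * 10

    batch_sampler = GroupedBatchSampler(RandomSampler(dataset), group_ids, batch_size=4)
    loader = DataLoader(dataset, batch_sampler=batch_sampler)

    for _, idx in loader:
        # every batch draws its indices from a single aspect-ratio group
        assert len({group_ids[i] for i in idx.tolist()}) == 1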
references/detection/presets.py:

 import torch
 import transforms as T

 class DetectionPresetTrain:
-    def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)):
-        if data_augmentation == 'hflip':
-            self.transforms = T.Compose([
-                T.RandomHorizontalFlip(p=hflip_prob),
-                T.PILToTensor(),
-                T.ConvertImageDtype(torch.float),
-            ])
-        elif data_augmentation == 'ssd':
-            self.transforms = T.Compose([
-                T.RandomPhotometricDistort(),
-                T.RandomZoomOut(fill=list(mean)),
-                T.RandomIoUCrop(),
-                T.RandomHorizontalFlip(p=hflip_prob),
-                T.PILToTensor(),
-                T.ConvertImageDtype(torch.float),
-            ])
-        elif data_augmentation == 'ssdlite':
-            self.transforms = T.Compose([
-                T.RandomIoUCrop(),
-                T.RandomHorizontalFlip(p=hflip_prob),
-                T.PILToTensor(),
-                T.ConvertImageDtype(torch.float),
-            ])
+    def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)):
+        if data_augmentation == "hflip":
+            self.transforms = T.Compose(
+                [
+                    T.RandomHorizontalFlip(p=hflip_prob),
+                    T.PILToTensor(),
+                    T.ConvertImageDtype(torch.float),
+                ]
+            )
+        elif data_augmentation == "ssd":
+            self.transforms = T.Compose(
+                [
+                    T.RandomPhotometricDistort(),
+                    T.RandomZoomOut(fill=list(mean)),
+                    T.RandomIoUCrop(),
+                    T.RandomHorizontalFlip(p=hflip_prob),
+                    T.PILToTensor(),
+                    T.ConvertImageDtype(torch.float),
+                ]
+            )
+        elif data_augmentation == "ssdlite":
+            self.transforms = T.Compose(
+                [
+                    T.RandomIoUCrop(),
+                    T.RandomHorizontalFlip(p=hflip_prob),
+                    T.PILToTensor(),
+                    T.ConvertImageDtype(torch.float),
+                ]
+            )
         else:
             raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"')
...
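Usage sketch (illustration only): a preset bundles the transform pipeline selected by the data_augmentation string; the dummy PIL image and target dict are invented:

    import torch
    from PIL import Image

    from presets import DetectionPresetTrain

    preset = DetectionPresetTrain(data_augmentation="hflip")
    img = Image.new("RGB", (320, 240))
    target = {"boxes": torch.tensor([[10.0, 20.0, 100.0, 120.0]]), "labels": torch.tensor([1])}

    # the preset's Compose pipeline follows the (image, target) convention
    img, target = preset.transforms(img, target)
    print(img.shape, img.dtype)  # torch.Size([3, 240, 320]) torch.float32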
references/detection/train.py:

@@ -21,26 +21,20 @@ import datetime
 import os
 import time
+import presets
 import torch
 import torch.utils.data
 import torchvision
 import torchvision.models.detection
 import torchvision.models.detection.mask_rcnn
+import utils
 from coco_utils import get_coco, get_coco_kp
-from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
 from engine import train_one_epoch, evaluate
+from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
-import presets
-import utils

 def get_dataset(name, image_set, transform, data_path):
-    paths = {
-        "coco": (data_path, get_coco, 91),
-        "coco_kp": (data_path, get_coco_kp, 2)
-    }
+    paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)}
     p, ds_fn, num_classes = paths[name]
     ds = ds_fn(p, image_set=image_set, transforms=transform)

@@ -53,42 +47,60 @@ def get_transform(train, data_augmentation):
 def get_args_parser(add_help=True):
     import argparse

-    parser = argparse.ArgumentParser(description='PyTorch Detection Training', add_help=add_help)
-    parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset')
-    parser.add_argument('--dataset', default='coco', help='dataset')
-    parser.add_argument('--model', default='maskrcnn_resnet50_fpn', help='model')
-    parser.add_argument('--device', default='cuda', help='device')
-    parser.add_argument('-b', '--batch-size', default=2, type=int,
-                        help='images per gpu, the total batch size is $NGPU x batch_size')
-    parser.add_argument('--epochs', default=26, type=int, metavar='N',
-                        help='number of total epochs to run')
-    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
-                        help='number of data loading workers (default: 4)')
-    parser.add_argument('--lr', default=0.02, type=float,
-                        help='initial learning rate, 0.02 is the default value for training '
-                             'on 8 gpus and 2 images_per_gpu')
-    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
-                        help='momentum')
-    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
-                        metavar='W', help='weight decay (default: 1e-4)',
-                        dest='weight_decay')
-    parser.add_argument('--lr-scheduler', default="multisteplr", help='the lr scheduler (default: multisteplr)')
-    parser.add_argument('--lr-step-size', default=8, type=int,
-                        help='decrease lr every step-size epochs (multisteplr scheduler only)')
-    parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int,
-                        help='decrease lr every step-size epochs (multisteplr scheduler only)')
-    parser.add_argument('--lr-gamma', default=0.1, type=float,
-                        help='decrease lr by a factor of lr-gamma (multisteplr scheduler only)')
-    parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
-    parser.add_argument('--output-dir', default='.', help='path where to save')
-    parser.add_argument('--resume', default='', help='resume from checkpoint')
-    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
-    parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
-    parser.add_argument('--rpn-score-thresh', default=None, type=float, help='rpn score threshold for faster-rcnn')
-    parser.add_argument('--trainable-backbone-layers', default=None, type=int,
-                        help='number of trainable layers of backbone')
-    parser.add_argument('--data-augmentation', default="hflip", help='data augmentation policy (default: hflip)')
+    parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help)
+    parser.add_argument("--data-path", default="/datasets01/COCO/022719/", help="dataset")
+    parser.add_argument("--dataset", default="coco", help="dataset")
+    parser.add_argument("--model", default="maskrcnn_resnet50_fpn", help="model")
+    parser.add_argument("--device", default="cuda", help="device")
+    parser.add_argument(
+        "-b", "--batch-size", default=2, type=int, help="images per gpu, the total batch size is $NGPU x batch_size"
+    )
+    parser.add_argument("--epochs", default=26, type=int, metavar="N", help="number of total epochs to run")
+    parser.add_argument(
+        "-j", "--workers", default=4, type=int, metavar="N", help="number of data loading workers (default: 4)"
+    )
+    parser.add_argument(
+        "--lr",
+        default=0.02,
+        type=float,
+        help="initial learning rate, 0.02 is the default value for training " "on 8 gpus and 2 images_per_gpu",
+    )
+    parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
+    parser.add_argument(
+        "--wd",
+        "--weight-decay",
+        default=1e-4,
+        type=float,
+        metavar="W",
+        help="weight decay (default: 1e-4)",
+        dest="weight_decay",
+    )
+    parser.add_argument("--lr-scheduler", default="multisteplr", help="the lr scheduler (default: multisteplr)")
+    parser.add_argument(
+        "--lr-step-size", default=8, type=int, help="decrease lr every step-size epochs (multisteplr scheduler only)"
+    )
+    parser.add_argument(
+        "--lr-steps",
+        default=[16, 22],
+        nargs="+",
+        type=int,
+        help="decrease lr every step-size epochs (multisteplr scheduler only)",
+    )
+    parser.add_argument(
+        "--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma (multisteplr scheduler only)"
+    )
+    parser.add_argument("--print-freq", default=20, type=int, help="print frequency")
+    parser.add_argument("--output-dir", default=".", help="path where to save")
+    parser.add_argument("--resume", default="", help="resume from checkpoint")
+    parser.add_argument("--start_epoch", default=0, type=int, help="start epoch")
+    parser.add_argument("--aspect-ratio-group-factor", default=3, type=int)
+    parser.add_argument("--rpn-score-thresh", default=None, type=float, help="rpn score threshold for faster-rcnn")
+    parser.add_argument(
+        "--trainable-backbone-layers", default=None, type=int, help="number of trainable layers of backbone"
+    )
+    parser.add_argument("--data-augmentation", default="hflip", help="data augmentation policy (default: hflip)")
     parser.add_argument(
         "--sync-bn",
         dest="sync_bn",

@@ -109,9 +121,8 @@ def get_args_parser(add_help=True):
     )
     # distributed training parameters
-    parser.add_argument('--world-size', default=1, type=int,
-                        help='number of distributed processes')
-    parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
+    parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")

     return parser

@@ -128,8 +139,9 @@ def main(args):
     # Data loading code
     print("Loading data")
-    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args.data_augmentation),
-                                       args.data_path)
+    dataset, num_classes = get_dataset(
+        args.dataset, "train", get_transform(True, args.data_augmentation), args.data_path
+    )
     dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args.data_augmentation), args.data_path)

     print("Creating data loaders")

@@ -144,27 +156,24 @@ def main(args):
         group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
         train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
     else:
-        train_batch_sampler = torch.utils.data.BatchSampler(
-            train_sampler, args.batch_size, drop_last=True)
+        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True)

     data_loader = torch.utils.data.DataLoader(
-        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
-        collate_fn=utils.collate_fn)
+        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
+    )

     data_loader_test = torch.utils.data.DataLoader(
-        dataset_test, batch_size=1,
-        sampler=test_sampler, num_workers=args.workers,
-        collate_fn=utils.collate_fn)
+        dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
+    )

     print("Creating model")
-    kwargs = {
-        "trainable_backbone_layers": args.trainable_backbone_layers
-    }
+    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
     if "rcnn" in args.model:
         if args.rpn_score_thresh is not None:
             kwargs["rpn_score_thresh"] = args.rpn_score_thresh
-    model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes, pretrained=args.pretrained,
-                                                              **kwargs)
+    model = torchvision.models.detection.__dict__[args.model](
+        num_classes=num_classes, pretrained=args.pretrained, **kwargs
+    )
     model.to(device)
     if args.distributed and args.sync_bn:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

@@ -175,24 +184,25 @@ def main(args):
         model_without_ddp = model.module

     params = [p for p in model.parameters() if p.requires_grad]
-    optimizer = torch.optim.SGD(
-        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

     args.lr_scheduler = args.lr_scheduler.lower()
-    if args.lr_scheduler == 'multisteplr':
+    if args.lr_scheduler == "multisteplr":
         lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
-    elif args.lr_scheduler == 'cosineannealinglr':
+    elif args.lr_scheduler == "cosineannealinglr":
         lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
     else:
-        raise RuntimeError("Invalid lr scheduler '{}'. Only MultiStepLR and CosineAnnealingLR "
-                           "are supported.".format(args.lr_scheduler))
+        raise RuntimeError(
+            "Invalid lr scheduler '{}'. Only MultiStepLR and CosineAnnealingLR "
+            "are supported.".format(args.lr_scheduler)
+        )

     if args.resume:
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        model_without_ddp.load_state_dict(checkpoint['model'])
-        optimizer.load_state_dict(checkpoint['optimizer'])
-        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
-        args.start_epoch = checkpoint['epoch'] + 1
+        checkpoint = torch.load(args.resume, map_location="cpu")
+        model_without_ddp.load_state_dict(checkpoint["model"])
+        optimizer.load_state_dict(checkpoint["optimizer"])
+        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
+        args.start_epoch = checkpoint["epoch"] + 1

     if args.test_only:
         evaluate(model, data_loader_test, device=device)

@@ -207,25 +217,21 @@ def main(args):
         lr_scheduler.step()
         if args.output_dir:
             checkpoint = {
-                'model': model_without_ddp.state_dict(),
-                'optimizer': optimizer.state_dict(),
-                'lr_scheduler': lr_scheduler.state_dict(),
-                'args': args,
-                'epoch': epoch
+                "model": model_without_ddp.state_dict(),
+                "optimizer": optimizer.state_dict(),
+                "lr_scheduler": lr_scheduler.state_dict(),
+                "args": args,
+                "epoch": epoch,
             }
-            utils.save_on_master(
-                checkpoint,
-                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
-            utils.save_on_master(
-                checkpoint,
-                os.path.join(args.output_dir, 'checkpoint.pth'))
+            utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
+            utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

         # evaluate after every epoch
         evaluate(model, data_loader_test, device=device)

     total_time = time.time() - start_time
     total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-    print('Training time {}'.format(total_time_str))
+    print("Training time {}".format(total_time_str))

 if __name__ == "__main__":
...
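Usage sketch (illustration only): because get_args_parser exposes the parser, the training entry point can also be driven programmatically; the argument values below are arbitrary examples:

    from train import get_args_parser  # references/detection/train.py

    parser = get_args_parser(add_help=False)
    args = parser.parse_args(
        ["--dataset", "coco", "--model", "maskrcnn_resnet50_fpn", "--epochs", "26", "--lr-steps", "16", "22"]
    )
    print(args.model, args.epochs, args.lr_steps)  # maskrcnn_resnet50_fpn 26 [16, 22]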
references/detection/transforms.py:

@@ -28,8 +28,9 @@ class Compose(object):
 class RandomHorizontalFlip(T.RandomHorizontalFlip):
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         if torch.rand(1) < self.p:
             image = F.hflip(image)
             if target is not None:

@@ -45,16 +46,18 @@ class RandomHorizontalFlip(T.RandomHorizontalFlip):
 class ToTensor(nn.Module):
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         image = F.pil_to_tensor(image)
         image = F.convert_image_dtype(image)
         return image, target

 class PILToTensor(nn.Module):
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         image = F.pil_to_tensor(image)
         return image, target

@@ -64,15 +67,23 @@ class ConvertImageDtype(nn.Module):
         super().__init__()
         self.dtype = dtype

-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         image = F.convert_image_dtype(image, self.dtype)
         return image, target

 class RandomIoUCrop(nn.Module):
-    def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5,
-                 max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40):
+    def __init__(
+        self,
+        min_scale: float = 0.3,
+        max_scale: float = 1.0,
+        min_aspect_ratio: float = 0.5,
+        max_aspect_ratio: float = 2.0,
+        sampler_options: Optional[List[float]] = None,
+        trials: int = 40,
+    ):
         super().__init__()
         # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
         self.min_scale = min_scale

@@ -84,14 +95,15 @@ class RandomIoUCrop(nn.Module):
         self.options = sampler_options
         self.trials = trials

-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         if target is None:
             raise ValueError("The targets can't be None for this transform.")

         if isinstance(image, torch.Tensor):
             if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
+                raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
             elif image.ndimension() == 2:
                 image = image.unsqueeze(0)

@@ -131,8 +143,9 @@ class RandomIoUCrop(nn.Module):
                 # check at least 1 box with jaccard limitations
                 boxes = target["boxes"][is_within_crop_area]
-                ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]],
-                                                                         dtype=boxes.dtype, device=boxes.device))
+                ious = torchvision.ops.boxes.box_iou(
+                    boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device)
+                )
                 if ious.max() < min_jaccard_overlap:
                     continue

@@ -149,13 +162,15 @@ class RandomIoUCrop(nn.Module):
 class RandomZoomOut(nn.Module):
-    def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5):
+    def __init__(
+        self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5
+    ):
         super().__init__()
         if fill is None:
-            fill = [0., 0., 0.]
+            fill = [0.0, 0.0, 0.0]
         self.fill = fill
         self.side_range = side_range
-        if side_range[0] < 1. or side_range[0] > side_range[1]:
+        if side_range[0] < 1.0 or side_range[0] > side_range[1]:
             raise ValueError("Invalid canvas side range provided {}.".format(side_range))
         self.p = p

@@ -165,11 +180,12 @@ class RandomZoomOut(nn.Module):
         # We fake the type to make it work on JIT
         return tuple(int(x) for x in self.fill) if is_pil else 0

-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         if isinstance(image, torch.Tensor):
             if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
+                raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
             elif image.ndimension() == 2:
                 image = image.unsqueeze(0)

@@ -196,8 +212,9 @@ class RandomZoomOut(nn.Module):
         image = F.pad(image, [left, top, right, bottom], fill=fill)
         if isinstance(image, torch.Tensor):
             v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1)
-            image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h):, :] = \
-                image[..., :, (left + orig_w):] = v
+            image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[
+                ..., :, (left + orig_w) :
+            ] = v

         if target is not None:
             target["boxes"][:, 0::2] += left

@@ -207,8 +224,14 @@ class RandomZoomOut(nn.Module):
 class RandomPhotometricDistort(nn.Module):
-    def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5),
-                 hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875, 1.125), p: float = 0.5):
+    def __init__(
+        self,
+        contrast: Tuple[float] = (0.5, 1.5),
+        saturation: Tuple[float] = (0.5, 1.5),
+        hue: Tuple[float] = (-0.05, 0.05),
+        brightness: Tuple[float] = (0.875, 1.125),
+        p: float = 0.5,
+    ):
         super().__init__()
         self._brightness = T.ColorJitter(brightness=brightness)
         self._contrast = T.ColorJitter(contrast=contrast)

@@ -216,11 +239,12 @@ class RandomPhotometricDistort(nn.Module):
         self._saturation = T.ColorJitter(saturation=saturation)
         self.p = p

-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
+    def forward(
+        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
+    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
         if isinstance(image, torch.Tensor):
             if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
+                raise ValueError("image should be 2/3 dimensional. Got {} dimensions.".format(image.ndimension()))
             elif image.ndimension() == 2:
                 image = image.unsqueeze(0)
...
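Usage sketch (illustration only): every transform in this file shares the forward(image, target) -> (image, target) convention, so they chain; the dummy tensors below are invented:

    import torch

    from transforms import RandomHorizontalFlip, RandomZoomOut

    image = torch.rand(3, 240, 320)  # CHW float image
    target = {"boxes": torch.tensor([[10.0, 20.0, 100.0, 120.0]]), "labels": torch.tensor([1])}

    zoom = RandomZoomOut(p=1.0)  # p=1.0 makes the demo deterministic
    flip = RandomHorizontalFlip(p=1.0)

    image, target = zoom(image, target)
    image, target = flip(image, target)
    print(image.shape, target["boxes"])  # larger canvas, boxes shifted and flipped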
references/detection/utils.py:

-from collections import defaultdict, deque
 import datetime
 import errno
 import os
 import time
+from collections import defaultdict, deque
 import torch
 import torch.distributed as dist

@@ -32,7 +32,7 @@ class SmoothedValue(object):
         """
         if not is_dist_avail_and_initialized():
             return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
         dist.barrier()
         dist.all_reduce(t)
         t = t.tolist()

@@ -63,11 +63,8 @@ class SmoothedValue(object):
     def __str__(self):
         return self.fmt.format(
-            median=self.median,
-            avg=self.avg,
-            global_avg=self.global_avg,
-            max=self.max,
-            value=self.value)
+            median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
+        )

 def all_gather(data):

@@ -130,15 +127,12 @@ class MetricLogger(object):
             return self.meters[attr]
         if attr in self.__dict__:
             return self.__dict__[attr]
-        raise AttributeError("'{}' object has no attribute '{}'".format(
-            type(self).__name__, attr))
+        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))

     def __str__(self):
         loss_str = []
         for name, meter in self.meters.items():
-            loss_str.append(
-                "{}: {}".format(name, str(meter))
-            )
+            loss_str.append("{}: {}".format(name, str(meter)))
         return self.delimiter.join(loss_str)

     def synchronize_between_processes(self):

@@ -151,31 +145,28 @@ class MetricLogger(object):
     def log_every(self, iterable, print_freq, header=None):
         i = 0
         if not header:
-            header = ''
+            header = ""
         start_time = time.time()
         end = time.time()
-        iter_time = SmoothedValue(fmt='{avg:.4f}')
-        data_time = SmoothedValue(fmt='{avg:.4f}')
-        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
         if torch.cuda.is_available():
-            log_msg = self.delimiter.join([
-                header,
-                '[{0' + space_fmt + '}/{1}]',
-                'eta: {eta}',
-                '{meters}',
-                'time: {time}',
-                'data: {data}',
-                'max mem: {memory:.0f}'
-            ])
+            log_msg = self.delimiter.join(
+                [
+                    header,
+                    "[{0" + space_fmt + "}/{1}]",
+                    "eta: {eta}",
+                    "{meters}",
+                    "time: {time}",
+                    "data: {data}",
+                    "max mem: {memory:.0f}",
+                ]
+            )
         else:
-            log_msg = self.delimiter.join([
-                header,
-                '[{0' + space_fmt + '}/{1}]',
-                'eta: {eta}',
-                '{meters}',
-                'time: {time}',
-                'data: {data}'
-            ])
+            log_msg = self.delimiter.join(
+                [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
+            )
         MB = 1024.0 * 1024.0
         for obj in iterable:
             data_time.update(time.time() - end)

@@ -185,22 +176,28 @@ class MetricLogger(object):
                 eta_seconds = iter_time.global_avg * (len(iterable) - i)
                 eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                 if torch.cuda.is_available():
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
-                        meters=str(self),
-                        time=str(iter_time), data=str(data_time),
-                        memory=torch.cuda.max_memory_allocated() / MB))
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / MB,
+                        )
+                    )
                 else:
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
-                        meters=str(self),
-                        time=str(iter_time), data=str(data_time)))
+                    print(
+                        log_msg.format(
+                            i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
+                        )
+                    )
             i += 1
             end = time.time()
         total_time = time.time() - start_time
         total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-        print('{} Total time: {} ({:.4f} s / it)'.format(
-            header, total_time_str, total_time / len(iterable)))
+        print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable)))

 def collate_fn(batch):

@@ -220,10 +217,11 @@ def setup_for_distributed(is_master):
     This function disables printing when not in master process
     """
     import builtins as __builtin__
+
     builtin_print = __builtin__.print

     def print(*args, **kwargs):
-        force = kwargs.pop('force', False)
+        force = kwargs.pop("force", False)
         if is_master or force:
             builtin_print(*args, **kwargs)

@@ -260,25 +258,25 @@ def save_on_master(*args, **kwargs):
 def init_distributed_mode(args):
-    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
         args.rank = int(os.environ["RANK"])
-        args.world_size = int(os.environ['WORLD_SIZE'])
-        args.gpu = int(os.environ['LOCAL_RANK'])
-    elif 'SLURM_PROCID' in os.environ:
-        args.rank = int(os.environ['SLURM_PROCID'])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])
+    elif "SLURM_PROCID" in os.environ:
+        args.rank = int(os.environ["SLURM_PROCID"])
         args.gpu = args.rank % torch.cuda.device_count()
     else:
-        print('Not using distributed mode')
+        print("Not using distributed mode")
         args.distributed = False
         return

     args.distributed = True

     torch.cuda.set_device(args.gpu)
-    args.dist_backend = 'nccl'
-    print('| distributed init (rank {}): {}'.format(
-        args.rank, args.dist_url), flush=True)
-    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
-                                         world_size=args.world_size, rank=args.rank)
+    args.dist_backend = "nccl"
+    print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
+    torch.distributed.init_process_group(
+        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
+    )
     torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
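Usage sketch (illustration only): MetricLogger.log_every wraps any iterable and prints timing and meter values every print_freq steps; the loop body is a stand-in for real work, and update() is assumed from the parts of utils.py not shown in this hunk:

    import time

    from utils import MetricLogger

    logger = MetricLogger(delimiter="  ")
    for step in logger.log_every(range(50), print_freq=10, header="Demo:"):
        time.sleep(0.01)  # stand-in for a training step
        logger.update(loss=1.0 / (step + 1))  # update() assumed from the full utils.py
    print("final:", str(logger))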
references/segmentation/coco_utils.py:

 import copy
+import os
 import torch
 import torch.utils.data
 import torchvision
 from PIL import Image
-import os
 from pycocotools import mask as coco_mask
 from transforms import Compose

@@ -90,14 +88,9 @@ def get_coco(root, image_set, transforms):
         "val": ("val2017", os.path.join("annotations", "instances_val2017.json")),
         # "train": ("val2017", os.path.join("annotations", "instances_val2017.json"))
     }
-    CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
-                1, 64, 20, 63, 7, 72]
-    transforms = Compose([
-        FilterAndRemapCocoCategories(CAT_LIST, remap=True),
-        ConvertCocoPolysToMask(),
-        transforms
-    ])
+    CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72]
+    transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms])

     img_folder, ann_file = PATHS[image_set]
     img_folder = os.path.join(root, img_folder)
...
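Usage sketch (illustration only): the root path below is a placeholder that must point at a local COCO download, and the identity transform stands in for a real preset:

    from coco_utils import get_coco

    dataset = get_coco(
        root="/path/to/COCO",  # placeholder: needs images and annotation files on disk
        image_set="train",
        transforms=lambda img, target: (img, target),  # identity stand-in for a preset
    )
    img, target = dataset[0]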
references/segmentation/presets.py:

 import torch
 import transforms as T

@@ -11,12 +10,14 @@ class SegmentationPresetTrain:
         trans = [T.RandomResize(min_size, max_size)]
         if hflip_prob > 0:
             trans.append(T.RandomHorizontalFlip(hflip_prob))
-        trans.extend([
-            T.RandomCrop(crop_size),
-            T.PILToTensor(),
-            T.ConvertImageDtype(torch.float),
-            T.Normalize(mean=mean, std=std),
-        ])
+        trans.extend(
+            [
+                T.RandomCrop(crop_size),
+                T.PILToTensor(),
+                T.ConvertImageDtype(torch.float),
+                T.Normalize(mean=mean, std=std),
+            ]
+        )
         self.transforms = T.Compose(trans)

     def __call__(self, img, target):

@@ -25,12 +26,14 @@ class SegmentationPresetTrain:
 class SegmentationPresetEval:
     def __init__(self, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
-        self.transforms = T.Compose([
-            T.RandomResize(base_size, base_size),
-            T.PILToTensor(),
-            T.ConvertImageDtype(torch.float),
-            T.Normalize(mean=mean, std=std),
-        ])
+        self.transforms = T.Compose(
+            [
+                T.RandomResize(base_size, base_size),
+                T.PILToTensor(),
+                T.ConvertImageDtype(torch.float),
+                T.Normalize(mean=mean, std=std),
+            ]
+        )

     def __call__(self, img, target):
         return self.transforms(img, target)
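Usage sketch (illustration only): the presets are callable and follow the same (img, target) convention; the dummy image and mask below are invented:

    from PIL import Image

    from presets import SegmentationPresetEval

    preset = SegmentationPresetEval(base_size=520)
    img = Image.new("RGB", (640, 480))
    mask = Image.new("L", (640, 480))

    img, mask = preset(img, mask)
    print(img.shape, img.dtype)  # resized CHW float tensor, normalized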
references/segmentation/train.py:

@@ -2,23 +2,23 @@ import datetime
 import os
 import time
+import presets
 import torch
 import torch.utils.data
-from torch import nn
 import torchvision
-from coco_utils import get_coco
-import presets
 import utils
+from coco_utils import get_coco
+from torch import nn

 def get_dataset(dir_path, name, image_set, transform):
     def sbd(*args, **kwargs):
-        return torchvision.datasets.SBDataset(*args, mode='segmentation', **kwargs)
+        return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs)

     paths = {
         "voc": (dir_path, torchvision.datasets.VOCSegmentation, 21),
         "voc_aug": (dir_path, sbd, 21),
-        "coco": (dir_path, get_coco, 21)
+        "coco": (dir_path, get_coco, 21),
     }
     p, ds_fn, num_classes = paths[name]

@@ -39,21 +39,21 @@ def criterion(inputs, target):
         losses[name] = nn.functional.cross_entropy(x, target, ignore_index=255)

     if len(losses) == 1:
-        return losses['out']
+        return losses["out"]

-    return losses['out'] + 0.5 * losses['aux']
+    return losses["out"] + 0.5 * losses["aux"]

 def evaluate(model, data_loader, device, num_classes):
     model.eval()
     confmat = utils.ConfusionMatrix(num_classes)
     metric_logger = utils.MetricLogger(delimiter="  ")
-    header = 'Test:'
+    header = "Test:"
     with torch.no_grad():
         for image, target in metric_logger.log_every(data_loader, 100, header):
             image, target = image.to(device), target.to(device)
             output = model(image)
-            output = output['out']
+            output = output["out"]

             confmat.update(target.flatten(), output.argmax(1).flatten())

@@ -65,8 +65,8 @@ def evaluate(model, data_loader, device, num_classes):
 def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, print_freq):
     model.train()
     metric_logger = utils.MetricLogger(delimiter="  ")
-    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
-    header = 'Epoch: [{}]'.format(epoch)
+    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
+    header = "Epoch: [{}]".format(epoch)
     for image, target in metric_logger.log_every(data_loader, print_freq, header):
         image, target = image.to(device), target.to(device)
         output = model(image)

@@ -101,18 +101,21 @@ def main(args):
     test_sampler = torch.utils.data.SequentialSampler(dataset_test)

     data_loader = torch.utils.data.DataLoader(
-        dataset, batch_size=args.batch_size,
-        sampler=train_sampler, num_workers=args.workers,
-        collate_fn=utils.collate_fn, drop_last=True)
+        dataset,
+        batch_size=args.batch_size,
+        sampler=train_sampler,
+        num_workers=args.workers,
+        collate_fn=utils.collate_fn,
+        drop_last=True,
+    )

     data_loader_test = torch.utils.data.DataLoader(
-        dataset_test, batch_size=1,
-        sampler=test_sampler, num_workers=args.workers,
-        collate_fn=utils.collate_fn)
+        dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
+    )

-    model = torchvision.models.segmentation.__dict__[args.model](num_classes=num_classes,
-                                                                 aux_loss=args.aux_loss,
-                                                                 pretrained=args.pretrained)
+    model = torchvision.models.segmentation.__dict__[args.model](
+        num_classes=num_classes, aux_loss=args.aux_loss, pretrained=args.pretrained
+    )
     model.to(device)
     if args.distributed:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

@@ -129,42 +132,42 @@ def main(args):
     if args.aux_loss:
         params = [p for p in model_without_ddp.aux_classifier.parameters() if p.requires_grad]
         params_to_optimize.append({"params": params, "lr": args.lr * 10})
-    optimizer = torch.optim.SGD(
-        params_to_optimize,
-        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+    optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

     iters_per_epoch = len(data_loader)
     main_lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
-        optimizer,
-        lambda x: (1 - x / (iters_per_epoch * (args.epochs - args.lr_warmup_epochs))) ** 0.9)
+        optimizer, lambda x: (1 - x / (iters_per_epoch * (args.epochs - args.lr_warmup_epochs))) ** 0.9
+    )

     if args.lr_warmup_epochs > 0:
         warmup_iters = iters_per_epoch * args.lr_warmup_epochs
         args.lr_warmup_method = args.lr_warmup_method.lower()
-        if args.lr_warmup_method == 'linear':
-            warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=args.lr_warmup_decay,
-                                                                    total_iters=warmup_iters)
-        elif args.lr_warmup_method == 'constant':
-            warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=args.lr_warmup_decay,
-                                                                      total_iters=warmup_iters)
+        if args.lr_warmup_method == "linear":
+            warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
+                optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
+            )
+        elif args.lr_warmup_method == "constant":
+            warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
+                optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
+            )
         else:
-            raise RuntimeError("Invalid warmup lr method '{}'. Only linear and constant "
-                               "are supported.".format(args.lr_warmup_method))
+            raise RuntimeError(
+                "Invalid warmup lr method '{}'. Only linear and constant "
+                "are supported.".format(args.lr_warmup_method)
+            )
         lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
-            optimizer,
-            schedulers=[warmup_lr_scheduler, main_lr_scheduler],
-            milestones=[warmup_iters]
+            optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
         )
     else:
         lr_scheduler = main_lr_scheduler

     if args.resume:
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        model_without_ddp.load_state_dict(checkpoint['model'], strict=not args.test_only)
+        checkpoint = torch.load(args.resume, map_location="cpu")
+        model_without_ddp.load_state_dict(checkpoint["model"], strict=not args.test_only)
         if not args.test_only:
-            optimizer.load_state_dict(checkpoint['optimizer'])
-            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
-            args.start_epoch = checkpoint['epoch'] + 1
+            optimizer.load_state_dict(checkpoint["optimizer"])
+            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
+            args.start_epoch = checkpoint["epoch"] + 1

     if args.test_only:
         confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)

@@ -179,53 +182,54 @@ def main(args):
         confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
         print(confmat)
         checkpoint = {
-            'model': model_without_ddp.state_dict(),
-            'optimizer': optimizer.state_dict(),
-            'lr_scheduler': lr_scheduler.state_dict(),
-            'epoch': epoch,
-            'args': args
+            "model": model_without_ddp.state_dict(),
+            "optimizer": optimizer.state_dict(),
+            "lr_scheduler": lr_scheduler.state_dict(),
+            "epoch": epoch,
+            "args": args,
         }
-        utils.save_on_master(
-            checkpoint,
-            os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
-        utils.save_on_master(
-            checkpoint,
-            os.path.join(args.output_dir, 'checkpoint.pth'))
+        utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
+        utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

     total_time = time.time() - start_time
     total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-    print('Training time {}'.format(total_time_str))
+    print("Training time {}".format(total_time_str))

 def get_args_parser(add_help=True):
     import argparse

-    parser = argparse.ArgumentParser(description='PyTorch Segmentation Training', add_help=add_help)
-    parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset path')
-    parser.add_argument('--dataset', default='coco', help='dataset name')
-    parser.add_argument('--model', default='fcn_resnet101', help='model')
-    parser.add_argument('--aux-loss', action='store_true', help='auxiliar loss')
-    parser.add_argument('--device', default='cuda', help='device')
-    parser.add_argument('-b', '--batch-size', default=8, type=int)
-    parser.add_argument('--epochs', default=30, type=int, metavar='N',
-                        help='number of total epochs to run')
-    parser.add_argument('-j', '--workers', default=16, type=int, metavar='N',
-                        help='number of data loading workers (default: 16)')
-    parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
-    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
-                        help='momentum')
-    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
-                        metavar='W', help='weight decay (default: 1e-4)',
-                        dest='weight_decay')
-    parser.add_argument('--lr-warmup-epochs', default=0, type=int, help='the number of epochs to warmup (default: 0)')
-    parser.add_argument('--lr-warmup-method', default="linear", type=str, help='the warmup method (default: linear)')
-    parser.add_argument('--lr-warmup-decay', default=0.01, type=float, help='the decay for lr')
-    parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
-    parser.add_argument('--output-dir', default='.', help='path where to save')
-    parser.add_argument('--resume', default='', help='resume from checkpoint')
-    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
-                        help='start epoch')
+    parser = argparse.ArgumentParser(description="PyTorch Segmentation Training", add_help=add_help)
+    parser.add_argument("--data-path", default="/datasets01/COCO/022719/", help="dataset path")
+    parser.add_argument("--dataset", default="coco", help="dataset name")
+    parser.add_argument("--model", default="fcn_resnet101", help="model")
+    parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss")
+    parser.add_argument("--device", default="cuda", help="device")
+    parser.add_argument("-b", "--batch-size", default=8, type=int)
+    parser.add_argument("--epochs", default=30, type=int, metavar="N", help="number of total epochs to run")
+    parser.add_argument(
+        "-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
+    )
+    parser.add_argument("--lr", default=0.01, type=float, help="initial learning rate")
+    parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
+    parser.add_argument(
+        "--wd",
+        "--weight-decay",
+        default=1e-4,
+        type=float,
+        metavar="W",
+        help="weight decay (default: 1e-4)",
+        dest="weight_decay",
+    )
+    parser.add_argument("--lr-warmup-epochs", default=0, type=int, help="the number of epochs to warmup (default: 0)")
+    parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
+    parser.add_argument("--lr-warmup-decay", default=0.01, type=float, help="the decay for lr")
+    parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
+    parser.add_argument("--output-dir", default=".", help="path where to save")
+    parser.add_argument("--resume", default="", help="resume from checkpoint")
+    parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
     parser.add_argument(
         "--test-only",
         dest="test_only",

@@ -239,9 +243,8 @@ def get_args_parser(add_help=True):
         action="store_true",
     )
     # distributed training parameters
-    parser.add_argument('--world-size', default=1, type=int,
-                        help='number of distributed processes')
-    parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
+    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
+    parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")

     return parser
...
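Illustration of the warmup pattern in main() above: SequentialLR switches from a warmup scheduler to the main one at a milestone. A self-contained hedged sketch (needs a PyTorch version that ships LinearLR and SequentialLR):

    import torch

    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    warmup_iters, total_iters = 10, 100
    main_sched = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda x: (1 - x / (total_iters - warmup_iters)) ** 0.9  # poly decay, as in main()
    )
    warmup_sched = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=warmup_iters)
    sched = torch.optim.lr_scheduler.SequentialLR(
        optimizer, schedulers=[warmup_sched, main_sched], milestones=[warmup_iters]
    )

    for _ in range(total_iters):
        optimizer.step()  # stand-in for a real training step
        sched.step()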
from collections import defaultdict, deque
import datetime import datetime
import errno
import os
import time import time
from collections import defaultdict, deque
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import errno
import os
class SmoothedValue(object): class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a """Track a series of values and provide access to smoothed values over a
...@@ -32,7 +32,7 @@ class SmoothedValue(object): ...@@ -32,7 +32,7 @@ class SmoothedValue(object):
""" """
if not is_dist_avail_and_initialized(): if not is_dist_avail_and_initialized():
return return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier() dist.barrier()
dist.all_reduce(t) dist.all_reduce(t)
t = t.tolist() t = t.tolist()
...@@ -63,11 +63,8 @@ class SmoothedValue(object): ...@@ -63,11 +63,8 @@ class SmoothedValue(object):
def __str__(self): def __str__(self):
return self.fmt.format( return self.fmt.format(
median=self.median, median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
avg=self.avg, )
global_avg=self.global_avg,
max=self.max,
value=self.value)
class ConfusionMatrix(object): class ConfusionMatrix(object):
...@@ -82,7 +79,7 @@ class ConfusionMatrix(object): ...@@ -82,7 +79,7 @@ class ConfusionMatrix(object):
with torch.no_grad(): with torch.no_grad():
k = (a >= 0) & (a < n) k = (a >= 0) & (a < n)
inds = n * a[k].to(torch.int64) + b[k] inds = n * a[k].to(torch.int64) + b[k]
self.mat += torch.bincount(inds, minlength=n**2).reshape(n, n) self.mat += torch.bincount(inds, minlength=n ** 2).reshape(n, n)
def reset(self): def reset(self):
self.mat.zero_() self.mat.zero_()
...@@ -104,15 +101,12 @@ class ConfusionMatrix(object): ...@@ -104,15 +101,12 @@ class ConfusionMatrix(object):
def __str__(self): def __str__(self):
acc_global, acc, iu = self.compute() acc_global, acc, iu = self.compute()
return ( return ("global correct: {:.1f}\n" "average row correct: {}\n" "IoU: {}\n" "mean IoU: {:.1f}").format(
'global correct: {:.1f}\n'
'average row correct: {}\n'
'IoU: {}\n'
'mean IoU: {:.1f}').format(
acc_global.item() * 100, acc_global.item() * 100,
['{:.1f}'.format(i) for i in (acc * 100).tolist()], ["{:.1f}".format(i) for i in (acc * 100).tolist()],
['{:.1f}'.format(i) for i in (iu * 100).tolist()], ["{:.1f}".format(i) for i in (iu * 100).tolist()],
iu.mean().item() * 100) iu.mean().item() * 100,
)
class MetricLogger(object): class MetricLogger(object):
...@@ -132,15 +126,12 @@ class MetricLogger(object): ...@@ -132,15 +126,12 @@ class MetricLogger(object):
return self.meters[attr] return self.meters[attr]
if attr in self.__dict__: if attr in self.__dict__:
return self.__dict__[attr] return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format( raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
type(self).__name__, attr))
def __str__(self): def __str__(self):
loss_str = [] loss_str = []
for name, meter in self.meters.items(): for name, meter in self.meters.items():
loss_str.append( loss_str.append("{}: {}".format(name, str(meter)))
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str) return self.delimiter.join(loss_str)
def synchronize_between_processes(self): def synchronize_between_processes(self):
...@@ -153,31 +144,28 @@ class MetricLogger(object): ...@@ -153,31 +144,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None): def log_every(self, iterable, print_freq, header=None):
i = 0 i = 0
if not header: if not header:
header = '' header = ""
start_time = time.time() start_time = time.time()
end = time.time() end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}') iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt='{avg:.4f}') data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ':' + str(len(str(len(iterable)))) + 'd' space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available(): if torch.cuda.is_available():
log_msg = self.delimiter.join([ log_msg = self.delimiter.join(
[
header, header,
'[{0' + space_fmt + '}/{1}]', "[{0" + space_fmt + "}/{1}]",
'eta: {eta}', "eta: {eta}",
'{meters}', "{meters}",
'time: {time}', "time: {time}",
'data: {data}', "data: {data}",
'max mem: {memory:.0f}' "max mem: {memory:.0f}",
]) ]
)
else: else:
log_msg = self.delimiter.join([ log_msg = self.delimiter.join(
header, [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
'[{0' + space_fmt + '}/{1}]', )
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0 MB = 1024.0 * 1024.0
for obj in iterable: for obj in iterable:
data_time.update(time.time() - end) data_time.update(time.time() - end)
...@@ -187,21 +175,28 @@ class MetricLogger(object): ...@@ -187,21 +175,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i) eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available(): if torch.cuda.is_available():
print(log_msg.format( print(
i, len(iterable), eta=eta_string, log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self), meters=str(self),
time=str(iter_time), data=str(data_time), time=str(iter_time),
memory=torch.cuda.max_memory_allocated() / MB)) data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else: else:
print(log_msg.format( print(
i, len(iterable), eta=eta_string, log_msg.format(
meters=str(self), i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
time=str(iter_time), data=str(data_time))) )
)
i += 1 i += 1
end = time.time() end = time.time()
total_time = time.time() - start_time total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time))) total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {}'.format(header, total_time_str)) print("{} Total time: {}".format(header, total_time_str))
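A worked example of the ETA formula used in the loop above: the smoothed average iteration time multiplied by the number of remaining iterations.

```python
# Numbers are illustrative.
import datetime

avg_iter_time = 0.25  # seconds per iteration (iter_time.global_avg)
remaining = 400       # len(iterable) - i
eta = str(datetime.timedelta(seconds=int(avg_iter_time * remaining)))
print(eta)            # 0:01:40
```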
def cat_list(images, fill_value=0): def cat_list(images, fill_value=0):
...@@ -209,7 +204,7 @@ def cat_list(images, fill_value=0): ...@@ -209,7 +204,7 @@ def cat_list(images, fill_value=0):
batch_shape = (len(images),) + max_size batch_shape = (len(images),) + max_size
batched_imgs = images[0].new(*batch_shape).fill_(fill_value) batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
for img, pad_img in zip(images, batched_imgs): for img, pad_img in zip(images, batched_imgs):
pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img) pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
return batched_imgs return batched_imgs
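A self-contained sketch of what `cat_list` does: images of different sizes are copied into the top-left corner of a batch tensor padded with `fill_value` (shapes here are illustrative).

```python
import torch

images = [torch.ones(3, 4, 5), torch.ones(3, 6, 3)]
max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))  # (3, 6, 5)
batched = images[0].new(len(images), *max_size).fill_(0)
for img, pad_img in zip(images, batched):
    pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
print(batched.shape)  # torch.Size([2, 3, 6, 5])
```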
...@@ -233,10 +228,11 @@ def setup_for_distributed(is_master): ...@@ -233,10 +228,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process This function disables printing when not in master process
""" """
import builtins as __builtin__ import builtins as __builtin__
builtin_print = __builtin__.print builtin_print = __builtin__.print
def print(*args, **kwargs): def print(*args, **kwargs):
force = kwargs.pop('force', False) force = kwargs.pop("force", False)
if is_master or force: if is_master or force:
builtin_print(*args, **kwargs) builtin_print(*args, **kwargs)
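A minimal self-contained demo of the master-only print pattern above (the demo function name is made up; behavior mirrors the hunk):

```python
import builtins

def setup_for_distributed_demo(is_master):
    builtin_print = builtins.print

    def print(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    builtins.print = print

setup_for_distributed_demo(is_master=False)
print("silenced on worker ranks")
print("always shown", force=True)
```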
...@@ -273,26 +269,26 @@ def save_on_master(*args, **kwargs): ...@@ -273,26 +269,26 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args): def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"]) args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE']) args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ['LOCAL_RANK']) args.gpu = int(os.environ["LOCAL_RANK"])
elif 'SLURM_PROCID' in os.environ: elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ['SLURM_PROCID']) args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count() args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"): elif hasattr(args, "rank"):
pass pass
else: else:
print('Not using distributed mode') print("Not using distributed mode")
args.distributed = False args.distributed = False
return return
args.distributed = True args.distributed = True
torch.cuda.set_device(args.gpu) torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl' args.dist_backend = "nccl"
print('| distributed init (rank {}): {}'.format( print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
world_size=args.world_size, rank=args.rank) )
setup_for_distributed(args.rank == 0) setup_for_distributed(args.rank == 0)
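The `env://` rendezvous above reads the variables checked at the top of `init_distributed_mode`; `torchrun` sets the first three per process. A hedged sketch for one node with two processes (all values illustrative):

```python
import os

os.environ.setdefault("RANK", "0")         # global rank of this process
os.environ.setdefault("WORLD_SIZE", "2")   # total number of processes
os.environ.setdefault("LOCAL_RANK", "0")   # GPU index on this node
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
```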
''' """
Pytorch adaptation of https://omoindrot.github.io/triplet-loss Pytorch adaptation of https://omoindrot.github.io/triplet-loss
https://github.com/omoindrot/tensorflow-triplet-loss https://github.com/omoindrot/tensorflow-triplet-loss
''' """
import torch import torch
import torch.nn as nn import torch.nn as nn
class TripletMarginLoss(nn.Module): class TripletMarginLoss(nn.Module):
def __init__(self, margin=1.0, p=2., mining='batch_all'): def __init__(self, margin=1.0, p=2.0, mining="batch_all"):
super(TripletMarginLoss, self).__init__() super(TripletMarginLoss, self).__init__()
self.margin = margin self.margin = margin
self.p = p self.p = p
self.mining = mining self.mining = mining
if mining == 'batch_all': if mining == "batch_all":
self.loss_fn = batch_all_triplet_loss self.loss_fn = batch_all_triplet_loss
if mining == 'batch_hard': if mining == "batch_hard":
self.loss_fn = batch_hard_triplet_loss self.loss_fn = batch_hard_triplet_loss
def forward(self, embeddings, labels): def forward(self, embeddings, labels):
......
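`forward` and the mining functions are elided from this diff. A hedged sketch of batch-all mining, assuming `batch_all_triplet_loss` follows the referenced omoindrot tutorial: form every (anchor, positive, negative) triple in the batch and average the loss over the active ones.

```python
import torch

def batch_all_triplet_loss_sketch(embeddings, labels, margin=1.0, p=2.0):
    dist = torch.cdist(embeddings, embeddings, p=p)      # pairwise distances
    same = labels.unsqueeze(0) == labels.unsqueeze(1)    # same-label mask
    eye = torch.eye(len(labels), dtype=torch.bool)
    # (a, p) needs same label (a != p); (a, n) needs different labels
    valid = (same & ~eye).unsqueeze(2) & (~same).unsqueeze(1)
    # loss[a, p, n] = d(a, p) - d(a, n) + margin, clamped at zero
    loss = torch.relu(dist.unsqueeze(2) - dist.unsqueeze(1) + margin) * valid
    return loss.sum() / (loss > 0).sum().clamp(min=1)

emb, labels = torch.randn(8, 16), torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
print(batch_all_triplet_loss_sketch(emb, labels))
```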
import random
from collections import defaultdict
import torch import torch
from torch.utils.data.sampler import Sampler from torch.utils.data.sampler import Sampler
from collections import defaultdict
import random
def create_groups(groups, k): def create_groups(groups, k):
......
import unittest import unittest
from collections import defaultdict from collections import defaultdict
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
import torchvision.transforms as transforms import torchvision.transforms as transforms
from sampler import PKSampler from sampler import PKSampler
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
class Tester(unittest.TestCase): class Tester(unittest.TestCase):
def test_pksampler(self): def test_pksampler(self):
p, k = 16, 4 p, k = 16, 4
...@@ -19,8 +17,7 @@ class Tester(unittest.TestCase): ...@@ -19,8 +17,7 @@ class Tester(unittest.TestCase):
self.assertRaises(AssertionError, PKSampler, targets, p, k) self.assertRaises(AssertionError, PKSampler, targets, p, k)
# Ensure p, k constraints on batch # Ensure p, k constraints on batch
dataset = FakeData(size=1000, num_classes=100, image_size=(3, 1, 1), dataset = FakeData(size=1000, num_classes=100, image_size=(3, 1, 1), transform=transforms.ToTensor())
transform=transforms.ToTensor())
targets = [target.item() for _, target in dataset] targets = [target.item() for _, target in dataset]
sampler = PKSampler(targets, p, k) sampler = PKSampler(targets, p, k)
loader = DataLoader(dataset, batch_size=p * k, sampler=sampler) loader = DataLoader(dataset, batch_size=p * k, sampler=sampler)
...@@ -38,5 +35,5 @@ class Tester(unittest.TestCase): ...@@ -38,5 +35,5 @@ class Tester(unittest.TestCase):
self.assertEqual(bins[b], k) self.assertEqual(bins[b], k)
if __name__ == '__main__': if __name__ == "__main__":
unittest.main() unittest.main()
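For readers of this test: the assumed behaviour of `sampler.PKSampler`, sketched as a batch generator, is that every batch holds `p` distinct labels with `k` samples drawn from each (this sketch is illustrative, not the actual sampler implementation):

```python
import random
from collections import defaultdict

def pk_batches(targets, p, k):
    groups = defaultdict(list)
    for idx, label in enumerate(targets):
        groups[label].append(idx)
    labels = [label for label, idxs in groups.items() if len(idxs) >= k]
    while True:
        batch = []
        for label in random.sample(labels, p):
            batch.extend(random.sample(groups[label], k))
        yield batch

targets = [i % 16 for i in range(160)]           # 16 labels, 10 samples each
print(len(next(pk_batches(targets, p=4, k=4))))  # 16 = 4 labels x 4 samples
```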
import os import os
import torch import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
import torchvision.transforms as transforms import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from loss import TripletMarginLoss from loss import TripletMarginLoss
from sampler import PKSampler
from model import EmbeddingNet from model import EmbeddingNet
from sampler import PKSampler
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.datasets import FashionMNIST
def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_freq): def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_freq):
...@@ -33,7 +31,7 @@ def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_f ...@@ -33,7 +31,7 @@ def train_epoch(model, optimizer, criterion, data_loader, device, epoch, print_f
i += 1 i += 1
avg_loss = running_loss / print_freq avg_loss = running_loss / print_freq
avg_trip = 100.0 * running_frac_pos_triplets / print_freq avg_trip = 100.0 * running_frac_pos_triplets / print_freq
print('[{:d}, {:d}] | loss: {:.4f} | % avg hard triplets: {:.2f}%'.format(epoch, i, avg_loss, avg_trip)) print("[{:d}, {:d}] | loss: {:.4f} | % avg hard triplets: {:.2f}%".format(epoch, i, avg_loss, avg_trip))
running_loss = 0 running_loss = 0
running_frac_pos_triplets = 0 running_frac_pos_triplets = 0
...@@ -79,17 +77,17 @@ def evaluate(model, loader, device): ...@@ -79,17 +77,17 @@ def evaluate(model, loader, device):
threshold, accuracy = find_best_threshold(dists, targets, device) threshold, accuracy = find_best_threshold(dists, targets, device)
print('accuracy: {:.3f}%, threshold: {:.2f}'.format(accuracy, threshold)) print("accuracy: {:.3f}%, threshold: {:.2f}".format(accuracy, threshold))
def save(model, epoch, save_dir, file_name): def save(model, epoch, save_dir, file_name):
file_name = 'epoch_' + str(epoch) + '__' + file_name file_name = "epoch_" + str(epoch) + "__" + file_name
save_path = os.path.join(save_dir, file_name) save_path = os.path.join(save_dir, file_name)
torch.save(model.state_dict(), save_path) torch.save(model.state_dict(), save_path)
def main(args): def main(args):
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
p = args.labels_per_batch p = args.labels_per_batch
k = args.samples_per_label k = args.samples_per_label
batch_size = p * k batch_size = p * k
...@@ -103,9 +101,9 @@ def main(args): ...@@ -103,9 +101,9 @@ def main(args):
criterion = TripletMarginLoss(margin=args.margin) criterion = TripletMarginLoss(margin=args.margin)
optimizer = Adam(model.parameters(), lr=args.lr) optimizer = Adam(model.parameters(), lr=args.lr)
transform = transforms.Compose([transforms.Lambda(lambda image: image.convert('RGB')), transform = transforms.Compose(
transforms.Resize((224, 224)), [transforms.Lambda(lambda image: image.convert("RGB")), transforms.Resize((224, 224)), transforms.ToTensor()]
transforms.ToTensor()]) )
# Using FMNIST to demonstrate embedding learning using triplet loss. This dataset can # Using FMNIST to demonstrate embedding learning using triplet loss. This dataset can
# be replaced with any classification dataset. # be replaced with any classification dataset.
...@@ -118,48 +116,44 @@ def main(args): ...@@ -118,48 +116,44 @@ def main(args):
# targets attribute with the same format. # targets attribute with the same format.
targets = train_dataset.targets.tolist() targets = train_dataset.targets.tolist()
train_loader = DataLoader(train_dataset, batch_size=batch_size, train_loader = DataLoader(
sampler=PKSampler(targets, p, k), train_dataset, batch_size=batch_size, sampler=PKSampler(targets, p, k), num_workers=args.workers
num_workers=args.workers) )
test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers)
shuffle=False,
num_workers=args.workers)
for epoch in range(1, args.epochs + 1): for epoch in range(1, args.epochs + 1):
print('Training...') print("Training...")
train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq) train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq)
print('Evaluating...') print("Evaluating...")
evaluate(model, test_loader, device) evaluate(model, test_loader, device)
print('Saving...') print("Saving...")
save(model, epoch, args.save_dir, 'ckpt.pth') save(model, epoch, args.save_dir, "ckpt.pth")
def parse_args(): def parse_args():
import argparse import argparse
parser = argparse.ArgumentParser(description='PyTorch Embedding Learning')
parser = argparse.ArgumentParser(description="PyTorch Embedding Learning")
parser.add_argument('--dataset-dir', default='/tmp/fmnist/',
help='FashionMNIST dataset directory path') parser.add_argument("--dataset-dir", default="/tmp/fmnist/", help="FashionMNIST dataset directory path")
parser.add_argument('-p', '--labels-per-batch', default=8, type=int, parser.add_argument(
help='Number of unique labels/classes per batch') "-p", "--labels-per-batch", default=8, type=int, help="Number of unique labels/classes per batch"
parser.add_argument('-k', '--samples-per-label', default=8, type=int, )
help='Number of samples per label in a batch') parser.add_argument("-k", "--samples-per-label", default=8, type=int, help="Number of samples per label in a batch")
parser.add_argument('--eval-batch-size', default=512, type=int) parser.add_argument("--eval-batch-size", default=512, type=int)
parser.add_argument('--epochs', default=10, type=int, metavar='N', parser.add_argument("--epochs", default=10, type=int, metavar="N", help="Number of training epochs to run")
help='Number of training epochs to run') parser.add_argument("-j", "--workers", default=4, type=int, metavar="N", help="Number of data loading workers")
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', parser.add_argument("--lr", default=0.0001, type=float, help="Learning rate")
help='Number of data loading workers') parser.add_argument("--margin", default=0.2, type=float, help="Triplet loss margin")
parser.add_argument('--lr', default=0.0001, type=float, help='Learning rate') parser.add_argument("--print-freq", default=20, type=int, help="Print frequency")
parser.add_argument('--margin', default=0.2, type=float, help='Triplet loss margin') parser.add_argument("--save-dir", default=".", help="Model save directory")
parser.add_argument('--print-freq', default=20, type=int, help='Print frequency') parser.add_argument("--resume", default="", help="Resume from checkpoint")
parser.add_argument('--save-dir', default='.', help='Model save directory')
parser.add_argument('--resume', default='', help='Resume from checkpoint')
return parser.parse_args() return parser.parse_args()
if __name__ == '__main__': if __name__ == "__main__":
args = parse_args() args = parse_args()
main(args) main(args)
import torch import torch
from torchvision.transforms import transforms from torchvision.transforms import transforms
from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW
class VideoClassificationPresetTrain: class VideoClassificationPresetTrain:
def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989), def __init__(
hflip_prob=0.5): self,
resize_size,
crop_size,
mean=(0.43216, 0.394666, 0.37645),
std=(0.22803, 0.22145, 0.216989),
hflip_prob=0.5,
):
trans = [ trans = [
ConvertBHWCtoBCHW(), ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
...@@ -14,11 +19,7 @@ class VideoClassificationPresetTrain: ...@@ -14,11 +19,7 @@ class VideoClassificationPresetTrain:
] ]
if hflip_prob > 0: if hflip_prob > 0:
trans.append(transforms.RandomHorizontalFlip(hflip_prob)) trans.append(transforms.RandomHorizontalFlip(hflip_prob))
trans.extend([ trans.extend([transforms.Normalize(mean=mean, std=std), transforms.RandomCrop(crop_size), ConvertBCHWtoCBHW()])
transforms.Normalize(mean=mean, std=std),
transforms.RandomCrop(crop_size),
ConvertBCHWtoCBHW()
])
self.transforms = transforms.Compose(trans) self.transforms = transforms.Compose(trans)
def __call__(self, x): def __call__(self, x):
...@@ -27,14 +28,16 @@ class VideoClassificationPresetTrain: ...@@ -27,14 +28,16 @@ class VideoClassificationPresetTrain:
class VideoClassificationPresetEval: class VideoClassificationPresetEval:
def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)): def __init__(self, resize_size, crop_size, mean=(0.43216, 0.394666, 0.37645), std=(0.22803, 0.22145, 0.216989)):
self.transforms = transforms.Compose([ self.transforms = transforms.Compose(
[
ConvertBHWCtoBCHW(), ConvertBHWCtoBCHW(),
transforms.ConvertImageDtype(torch.float32), transforms.ConvertImageDtype(torch.float32),
transforms.Resize(resize_size), transforms.Resize(resize_size),
transforms.Normalize(mean=mean, std=std), transforms.Normalize(mean=mean, std=std),
transforms.CenterCrop(crop_size), transforms.CenterCrop(crop_size),
ConvertBCHWtoCBHW() ConvertBCHWtoCBHW(),
]) ]
)
def __call__(self, x): def __call__(self, x):
return self.transforms(x) return self.transforms(x)
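A quick shape check for the two converters used in these presets: video batches arrive channels-last, and the 3D models expect channels first with the frame dimension after channels (sizes are illustrative).

```python
import torch

vid = torch.zeros(8, 112, 112, 3)   # (B, H, W, C), e.g. 8 frames of a clip
bchw = vid.permute(0, 3, 1, 2)      # ConvertBHWCtoBCHW -> (8, 3, 112, 112)
cbhw = bchw.permute(1, 0, 2, 3)     # ConvertBCHWtoCBHW -> (3, 8, 112, 112)
print(bchw.shape, cbhw.shape)
```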
import datetime import datetime
import os import os
import time import time
import presets
import torch import torch
import torch.utils.data import torch.utils.data
from torch.utils.data.dataloader import default_collate
from torch import nn
import torchvision import torchvision
import torchvision.datasets.video_utils import torchvision.datasets.video_utils
from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
import presets
import utils import utils
from torch import nn
from torch.utils.data.dataloader import default_collate
from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
try: try:
from apex import amp from apex import amp
...@@ -21,10 +21,10 @@ except ImportError: ...@@ -21,10 +21,10 @@ except ImportError:
def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, apex=False): def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, apex=False):
model.train() model.train()
metric_logger = utils.MetricLogger(delimiter=" ") metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}')) metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
metric_logger.add_meter('clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}')) metric_logger.add_meter("clips/s", utils.SmoothedValue(window_size=10, fmt="{value:.3f}"))
header = 'Epoch: [{}]'.format(epoch) header = "Epoch: [{}]".format(epoch)
for video, target in metric_logger.log_every(data_loader, print_freq, header): for video, target in metric_logger.log_every(data_loader, print_freq, header):
start_time = time.time() start_time = time.time()
video, target = video.to(device), target.to(device) video, target = video.to(device), target.to(device)
...@@ -42,16 +42,16 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, devi ...@@ -42,16 +42,16 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, devi
acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
batch_size = video.shape[0] batch_size = video.shape[0]
metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"]) metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
metric_logger.meters['clips/s'].update(batch_size / (time.time() - start_time)) metric_logger.meters["clips/s"].update(batch_size / (time.time() - start_time))
lr_scheduler.step() lr_scheduler.step()
def evaluate(model, criterion, data_loader, device): def evaluate(model, criterion, data_loader, device):
model.eval() model.eval()
metric_logger = utils.MetricLogger(delimiter=" ") metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:' header = "Test:"
with torch.no_grad(): with torch.no_grad():
for video, target in metric_logger.log_every(data_loader, 100, header): for video, target in metric_logger.log_every(data_loader, 100, header):
video = video.to(device, non_blocking=True) video = video.to(device, non_blocking=True)
...@@ -64,18 +64,22 @@ def evaluate(model, criterion, data_loader, device): ...@@ -64,18 +64,22 @@ def evaluate(model, criterion, data_loader, device):
# could have been padded in distributed setup # could have been padded in distributed setup
batch_size = video.shape[0] batch_size = video.shape[0]
metric_logger.update(loss=loss.item()) metric_logger.update(loss=loss.item())
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
# gather the stats from all processes # gather the stats from all processes
metric_logger.synchronize_between_processes() metric_logger.synchronize_between_processes()
print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}' print(
.format(top1=metric_logger.acc1, top5=metric_logger.acc5)) " * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}".format(
top1=metric_logger.acc1, top5=metric_logger.acc5
)
)
return metric_logger.acc1.global_avg return metric_logger.acc1.global_avg
def _get_cache_path(filepath): def _get_cache_path(filepath):
import hashlib import hashlib
h = hashlib.sha1(filepath.encode()).hexdigest() h = hashlib.sha1(filepath.encode()).hexdigest()
cache_path = os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt") cache_path = os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")
cache_path = os.path.expanduser(cache_path) cache_path = os.path.expanduser(cache_path)
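A worked example of `_get_cache_path`: the dataset directory string is SHA1-hashed so each distinct path gets a stable cache file name (the path below is illustrative).

```python
import hashlib
import os

filepath = "/datasets01_101/kinetics/070618/train_avi-480p"
h = hashlib.sha1(filepath.encode()).hexdigest()
cache_path = os.path.join("~", ".torch", "vision", "datasets", "kinetics", h[:10] + ".pt")
print(os.path.expanduser(cache_path))  # ~/.torch/vision/datasets/kinetics/<hash>.pt
```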
...@@ -90,8 +94,10 @@ def collate_fn(batch): ...@@ -90,8 +94,10 @@ def collate_fn(batch):
def main(args): def main(args):
if args.apex and amp is None: if args.apex and amp is None:
raise RuntimeError("Failed to import apex. Please install apex from https://www.github.com/nvidia/apex " raise RuntimeError(
"to enable mixed-precision training.") "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
"to enable mixed-precision training."
)
if args.output_dir: if args.output_dir:
utils.mkdir(args.output_dir) utils.mkdir(args.output_dir)
...@@ -121,15 +127,17 @@ def main(args): ...@@ -121,15 +127,17 @@ def main(args):
dataset.transform = transform_train dataset.transform = transform_train
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache " print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster")
"on a single-gpu first, as it will be faster")
dataset = torchvision.datasets.Kinetics400( dataset = torchvision.datasets.Kinetics400(
traindir, traindir,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
step_between_clips=1, step_between_clips=1,
transform=transform_train, transform=transform_train,
frame_rate=15, frame_rate=15,
extensions=('avi', 'mp4', ) extensions=(
"avi",
"mp4",
),
) )
if args.cache_dataset: if args.cache_dataset:
print("Saving dataset_train to {}".format(cache_path)) print("Saving dataset_train to {}".format(cache_path))
...@@ -149,15 +157,17 @@ def main(args): ...@@ -149,15 +157,17 @@ def main(args):
dataset_test.transform = transform_test dataset_test.transform = transform_test
else: else:
if args.distributed: if args.distributed:
print("It is recommended to pre-compute the dataset cache " print("It is recommended to pre-compute the dataset cache " "on a single-gpu first, as it will be faster")
"on a single-gpu first, as it will be faster")
dataset_test = torchvision.datasets.Kinetics400( dataset_test = torchvision.datasets.Kinetics400(
valdir, valdir,
frames_per_clip=args.clip_len, frames_per_clip=args.clip_len,
step_between_clips=1, step_between_clips=1,
transform=transform_test, transform=transform_test,
frame_rate=15, frame_rate=15,
extensions=('avi', 'mp4',) extensions=(
"avi",
"mp4",
),
) )
if args.cache_dataset: if args.cache_dataset:
print("Saving dataset_test to {}".format(cache_path)) print("Saving dataset_test to {}".format(cache_path))
...@@ -172,14 +182,22 @@ def main(args): ...@@ -172,14 +182,22 @@ def main(args):
test_sampler = DistributedSampler(test_sampler) test_sampler = DistributedSampler(test_sampler)
data_loader = torch.utils.data.DataLoader( data_loader = torch.utils.data.DataLoader(
dataset, batch_size=args.batch_size, dataset,
sampler=train_sampler, num_workers=args.workers, batch_size=args.batch_size,
pin_memory=True, collate_fn=collate_fn) sampler=train_sampler,
num_workers=args.workers,
pin_memory=True,
collate_fn=collate_fn,
)
data_loader_test = torch.utils.data.DataLoader( data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=args.batch_size, dataset_test,
sampler=test_sampler, num_workers=args.workers, batch_size=args.batch_size,
pin_memory=True, collate_fn=collate_fn) sampler=test_sampler,
num_workers=args.workers,
pin_memory=True,
collate_fn=collate_fn,
)
print("Creating model") print("Creating model")
model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained) model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained)
...@@ -190,13 +208,10 @@ def main(args): ...@@ -190,13 +208,10 @@ def main(args):
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss()
lr = args.lr * args.world_size lr = args.lr * args.world_size
optimizer = torch.optim.SGD( optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay)
model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay)
if args.apex: if args.apex:
model, optimizer = amp.initialize(model, optimizer, model, optimizer = amp.initialize(model, optimizer, opt_level=args.apex_opt_level)
opt_level=args.apex_opt_level
)
# convert scheduler to be per iteration, not per epoch, for warmup that lasts # convert scheduler to be per iteration, not per epoch, for warmup that lasts
# between different epochs # between different epochs
...@@ -207,20 +222,22 @@ def main(args): ...@@ -207,20 +222,22 @@ def main(args):
if args.lr_warmup_epochs > 0: if args.lr_warmup_epochs > 0:
warmup_iters = iters_per_epoch * args.lr_warmup_epochs warmup_iters = iters_per_epoch * args.lr_warmup_epochs
args.lr_warmup_method = args.lr_warmup_method.lower() args.lr_warmup_method = args.lr_warmup_method.lower()
if args.lr_warmup_method == 'linear': if args.lr_warmup_method == "linear":
warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=args.lr_warmup_decay, warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
total_iters=warmup_iters) optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters
elif args.lr_warmup_method == 'constant': )
warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=args.lr_warmup_decay, elif args.lr_warmup_method == "constant":
total_iters=warmup_iters) warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters
)
else: else:
raise RuntimeError("Invalid warmup lr method '{}'. Only linear and constant " raise RuntimeError(
"are supported.".format(args.lr_warmup_method)) "Invalid warmup lr method '{}'. Only linear and constant "
"are supported.".format(args.lr_warmup_method)
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR( lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
optimizer, optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters]
schedulers=[warmup_lr_scheduler, main_lr_scheduler],
milestones=[warmup_iters]
) )
else: else:
lr_scheduler = main_lr_scheduler lr_scheduler = main_lr_scheduler
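A minimal runnable sketch of the warmup composition above: `LinearLR` ramps the learning rate for `warmup_iters` steps, then `SequentialLR` hands over to the main schedule (the optimizer and milestone values below are illustrative).

```python
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
warmup_iters = 5
warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.001, total_iters=warmup_iters)
main = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup, main], milestones=[warmup_iters]
)
for _ in range(8):
    optimizer.step()
    scheduler.step()  # stepped per iteration, as in this script
```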
...@@ -231,11 +248,11 @@ def main(args): ...@@ -231,11 +248,11 @@ def main(args):
model_without_ddp = model.module model_without_ddp = model.module
if args.resume: if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu') checkpoint = torch.load(args.resume, map_location="cpu")
model_without_ddp.load_state_dict(checkpoint['model']) model_without_ddp.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint['optimizer']) optimizer.load_state_dict(checkpoint["optimizer"])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
args.start_epoch = checkpoint['epoch'] + 1 args.start_epoch = checkpoint["epoch"] + 1
if args.test_only: if args.test_only:
evaluate(model, criterion, data_loader_test, device=device) evaluate(model, criterion, data_loader_test, device=device)
...@@ -246,62 +263,65 @@ def main(args): ...@@ -246,62 +263,65 @@ def main(args):
for epoch in range(args.start_epoch, args.epochs): for epoch in range(args.start_epoch, args.epochs):
if args.distributed: if args.distributed:
train_sampler.set_epoch(epoch) train_sampler.set_epoch(epoch)
train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, train_one_epoch(
device, epoch, args.print_freq, args.apex) model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq, args.apex
)
evaluate(model, criterion, data_loader_test, device=device) evaluate(model, criterion, data_loader_test, device=device)
if args.output_dir: if args.output_dir:
checkpoint = { checkpoint = {
'model': model_without_ddp.state_dict(), "model": model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(), "optimizer": optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(), "lr_scheduler": lr_scheduler.state_dict(),
'epoch': epoch, "epoch": epoch,
'args': args} "args": args,
utils.save_on_master( }
checkpoint, utils.save_on_master(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'checkpoint.pth'))
total_time = time.time() - start_time total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time))) total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str)) print("Training time {}".format(total_time_str))
def parse_args(): def parse_args():
import argparse import argparse
parser = argparse.ArgumentParser(description='PyTorch Video Classification Training')
parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
parser.add_argument('--data-path', default='/datasets01_101/kinetics/070618/', help='dataset')
parser.add_argument('--train-dir', default='train_avi-480p', help='name of train dir') parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", help="dataset")
parser.add_argument('--val-dir', default='val_avi-480p', help='name of val dir') parser.add_argument("--train-dir", default="train_avi-480p", help="name of train dir")
parser.add_argument('--model', default='r2plus1d_18', help='model') parser.add_argument("--val-dir", default="val_avi-480p", help="name of val dir")
parser.add_argument('--device', default='cuda', help='device') parser.add_argument("--model", default="r2plus1d_18", help="model")
parser.add_argument('--clip-len', default=16, type=int, metavar='N', parser.add_argument("--device", default="cuda", help="device")
help='number of frames per clip') parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
parser.add_argument('--clips-per-video', default=5, type=int, metavar='N', parser.add_argument(
help='maximum number of clips per video to consider') "--clips-per-video", default=5, type=int, metavar="N", help="maximum number of clips per video to consider"
parser.add_argument('-b', '--batch-size', default=24, type=int) )
parser.add_argument('--epochs', default=45, type=int, metavar='N', parser.add_argument("-b", "--batch-size", default=24, type=int)
help='number of total epochs to run') parser.add_argument("--epochs", default=45, type=int, metavar="N", help="number of total epochs to run")
parser.add_argument('-j', '--workers', default=10, type=int, metavar='N', parser.add_argument(
help='number of data loading workers (default: 10)') "-j", "--workers", default=10, type=int, metavar="N", help="number of data loading workers (default: 10)"
parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate') )
parser.add_argument('--momentum', default=0.9, type=float, metavar='M', parser.add_argument("--lr", default=0.01, type=float, help="initial learning rate")
help='momentum') parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, parser.add_argument(
metavar='W', help='weight decay (default: 1e-4)', "--wd",
dest='weight_decay') "--weight-decay",
parser.add_argument('--lr-milestones', nargs='+', default=[20, 30, 40], type=int, help='decrease lr on milestones') default=1e-4,
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma') type=float,
parser.add_argument('--lr-warmup-epochs', default=10, type=int, help='the number of epochs to warmup (default: 10)') metavar="W",
parser.add_argument('--lr-warmup-method', default="linear", type=str, help='the warmup method (default: linear)') help="weight decay (default: 1e-4)",
parser.add_argument('--lr-warmup-decay', default=0.001, type=float, help='the decay for lr') dest="weight_decay",
parser.add_argument('--print-freq', default=10, type=int, help='print frequency') )
parser.add_argument('--output-dir', default='.', help='path where to save') parser.add_argument("--lr-milestones", nargs="+", default=[20, 30, 40], type=int, help="decrease lr on milestones")
parser.add_argument('--resume', default='', help='resume from checkpoint') parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
parser.add_argument('--start-epoch', default=0, type=int, metavar='N', parser.add_argument("--lr-warmup-epochs", default=10, type=int, help="the number of epochs to warmup (default: 10)")
help='start epoch') parser.add_argument("--lr-warmup-method", default="linear", type=str, help="the warmup method (default: linear)")
parser.add_argument("--lr-warmup-decay", default=0.001, type=float, help="the decay for lr")
parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
parser.add_argument("--output-dir", default=".", help="path where to save")
parser.add_argument("--resume", default="", help="resume from checkpoint")
parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
parser.add_argument( parser.add_argument(
"--cache-dataset", "--cache-dataset",
dest="cache_dataset", dest="cache_dataset",
...@@ -328,18 +348,19 @@ def parse_args(): ...@@ -328,18 +348,19 @@ def parse_args():
) )
# Mixed precision training parameters # Mixed precision training parameters
parser.add_argument('--apex', action='store_true', parser.add_argument("--apex", action="store_true", help="Use apex for mixed precision training")
help='Use apex for mixed precision training') parser.add_argument(
    parser.add_argument('--apex-opt-level', default='O1', type=str, "--apex-opt-level",
                        help='For apex mixed precision training' default="O1",
                             'O0 for FP32 training, O1 for mixed precision training.' type=str,
                             'For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet' help="For apex mixed precision training. "
    "O0 for FP32 training, O1 for mixed precision training. "
    "For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet",
) )
# distributed training parameters # distributed training parameters
parser.add_argument('--world-size', default=1, type=int, parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
help='number of distributed processes') parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
args = parser.parse_args() args = parser.parse_args()
......
...@@ -3,16 +3,14 @@ import torch.nn as nn ...@@ -3,16 +3,14 @@ import torch.nn as nn
class ConvertBHWCtoBCHW(nn.Module): class ConvertBHWCtoBCHW(nn.Module):
"""Convert tensor from (B, H, W, C) to (B, C, H, W) """Convert tensor from (B, H, W, C) to (B, C, H, W)"""
"""
def forward(self, vid: torch.Tensor) -> torch.Tensor: def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(0, 3, 1, 2) return vid.permute(0, 3, 1, 2)
class ConvertBCHWtoCBHW(nn.Module): class ConvertBCHWtoCBHW(nn.Module):
"""Convert tensor from (B, C, H, W) to (C, B, H, W) """Convert tensor from (B, C, H, W) to (C, B, H, W)"""
"""
def forward(self, vid: torch.Tensor) -> torch.Tensor: def forward(self, vid: torch.Tensor) -> torch.Tensor:
return vid.permute(1, 0, 2, 3) return vid.permute(1, 0, 2, 3)
from collections import defaultdict, deque
import datetime import datetime
import errno
import os
import time import time
from collections import defaultdict, deque
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import errno
import os
class SmoothedValue(object): class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a """Track a series of values and provide access to smoothed values over a
...@@ -32,7 +32,7 @@ class SmoothedValue(object): ...@@ -32,7 +32,7 @@ class SmoothedValue(object):
""" """
if not is_dist_avail_and_initialized(): if not is_dist_avail_and_initialized():
return return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier() dist.barrier()
dist.all_reduce(t) dist.all_reduce(t)
t = t.tolist() t = t.tolist()
...@@ -63,11 +63,8 @@ class SmoothedValue(object): ...@@ -63,11 +63,8 @@ class SmoothedValue(object):
def __str__(self): def __str__(self):
return self.fmt.format( return self.fmt.format(
median=self.median, median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
avg=self.avg, )
global_avg=self.global_avg,
max=self.max,
value=self.value)
class MetricLogger(object): class MetricLogger(object):
...@@ -87,15 +84,12 @@ class MetricLogger(object): ...@@ -87,15 +84,12 @@ class MetricLogger(object):
return self.meters[attr] return self.meters[attr]
if attr in self.__dict__: if attr in self.__dict__:
return self.__dict__[attr] return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format( raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
type(self).__name__, attr))
def __str__(self): def __str__(self):
loss_str = [] loss_str = []
for name, meter in self.meters.items(): for name, meter in self.meters.items():
loss_str.append( loss_str.append("{}: {}".format(name, str(meter)))
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str) return self.delimiter.join(loss_str)
def synchronize_between_processes(self): def synchronize_between_processes(self):
...@@ -108,31 +102,28 @@ class MetricLogger(object): ...@@ -108,31 +102,28 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None): def log_every(self, iterable, print_freq, header=None):
i = 0 i = 0
if not header: if not header:
header = '' header = ""
start_time = time.time() start_time = time.time()
end = time.time() end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}') iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt='{avg:.4f}') data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ':' + str(len(str(len(iterable)))) + 'd' space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available(): if torch.cuda.is_available():
log_msg = self.delimiter.join([ log_msg = self.delimiter.join(
[
header, header,
'[{0' + space_fmt + '}/{1}]', "[{0" + space_fmt + "}/{1}]",
'eta: {eta}', "eta: {eta}",
'{meters}', "{meters}",
'time: {time}', "time: {time}",
'data: {data}', "data: {data}",
'max mem: {memory:.0f}' "max mem: {memory:.0f}",
]) ]
)
else: else:
log_msg = self.delimiter.join([ log_msg = self.delimiter.join(
header, [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
'[{0' + space_fmt + '}/{1}]', )
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0 MB = 1024.0 * 1024.0
for obj in iterable: for obj in iterable:
data_time.update(time.time() - end) data_time.update(time.time() - end)
...@@ -142,21 +133,28 @@ class MetricLogger(object): ...@@ -142,21 +133,28 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i) eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available(): if torch.cuda.is_available():
print(log_msg.format( print(
i, len(iterable), eta=eta_string, log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self), meters=str(self),
time=str(iter_time), data=str(data_time), time=str(iter_time),
memory=torch.cuda.max_memory_allocated() / MB)) data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else: else:
print(log_msg.format( print(
i, len(iterable), eta=eta_string, log_msg.format(
meters=str(self), i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
time=str(iter_time), data=str(data_time))) )
)
i += 1 i += 1
end = time.time() end = time.time()
total_time = time.time() - start_time total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time))) total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {}'.format(header, total_time_str)) print("{} Total time: {}".format(header, total_time_str))
def accuracy(output, target, topk=(1,)): def accuracy(output, target, topk=(1,)):
...@@ -189,10 +187,11 @@ def setup_for_distributed(is_master): ...@@ -189,10 +187,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process This function disables printing when not in master process
""" """
import builtins as __builtin__ import builtins as __builtin__
builtin_print = __builtin__.print builtin_print = __builtin__.print
def print(*args, **kwargs): def print(*args, **kwargs):
force = kwargs.pop('force', False) force = kwargs.pop("force", False)
if is_master or force: if is_master or force:
builtin_print(*args, **kwargs) builtin_print(*args, **kwargs)
...@@ -229,26 +228,26 @@ def save_on_master(*args, **kwargs): ...@@ -229,26 +228,26 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args): def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"]) args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE']) args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ['LOCAL_RANK']) args.gpu = int(os.environ["LOCAL_RANK"])
elif 'SLURM_PROCID' in os.environ: elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ['SLURM_PROCID']) args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count() args.gpu = args.rank % torch.cuda.device_count()
elif hasattr(args, "rank"): elif hasattr(args, "rank"):
pass pass
else: else:
print('Not using distributed mode') print("Not using distributed mode")
args.distributed = False args.distributed = False
return return
args.distributed = True args.distributed = True
torch.cuda.set_device(args.gpu) torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl' args.dist_backend = "nccl"
print('| distributed init (rank {}): {}'.format( print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)
args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
world_size=args.world_size, rank=args.rank) )
setup_for_distributed(args.rank == 0) setup_for_distributed(args.rank == 0)
...@@ -9,7 +9,13 @@ max-line-length = 120 ...@@ -9,7 +9,13 @@ max-line-length = 120
[flake8] [flake8]
max-line-length = 120 max-line-length = 120
ignore = F401,E402,F403,W503,W504,F821 ignore = E203, E402, W503, W504, F821
per-file-ignores =
__init__.py: F401, F403, F405
./hubconf.py: F401
torchvision/models/mobilenet.py: F401, F403
torchvision/models/quantization/mobilenet.py: F401, F403
test/smoke_test.py: F401
exclude = venv exclude = venv
[pydocstyle] [pydocstyle]
......
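Why E203 joins the flake8 ignore list above: black treats `:` in slices with complex bounds as a binary operator and pads it with spaces (visible in the `cat_list` hunk earlier), and flake8 flags that as E203 ("whitespace before ':'"). A tiny black-formatted example:

```python
items = list(range(10))
n = 4
print(items[n - 1 : n + 1])  # spaces kept by black; flake8 E203 must be ignored
```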