Commit 64b02fb6 authored by liangjing

version 1
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
#run 8 dcu
mpirun -np 8 --allow-run-as-root --bind-to none -x LD_LIBRARY_PATH -x PATH `pwd`/dcu.sh localhost
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
#run 8 dcu
mpirun -np 8 --allow-run-as-root --bind-to none -x LD_LIBRARY_PATH -x PATH `pwd`/dcu_fp32.sh localhost
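# Note: each mpirun rank executes the per-rank launcher (dcu.sh / dcu_fp32.sh) passed on the
# command line; those scripts are not part of this commit. A minimal sketch of what such a
# wrapper could look like (names and variables below are assumptions, not the actual dcu.sh)
# would pick a device from the MPI rank and start the training entry point:
#
#   #!/bin/bash
#   # hypothetical per-rank wrapper
#   RANK=${OMPI_COMM_WORLD_RANK:-0}
#   export HIP_VISIBLE_DEVICES=$RANK
#   python train.py --local_rank "$RANK" --dataset-path /datasets/open-images-v6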
#!/usr/bin/env python3
import argparse
import torch
import torch.onnx
from torchvision.ops import misc as misc_nn_ops
from torch.autograd import Variable
from model.resnet import resnet50, resnext50_32x4d
def parse_args(add_help=True):
parser = argparse.ArgumentParser(description='Convert RetinaNet backbone to onnx format', add_help=add_help)
parser.add_argument('--backbone', default='resnext50_32x4d', choices=['resnet50', 'resnext50_32x4d'],
help='The model backbone')
parser.add_argument('--output', default=None, help='output onnx file')
parser.add_argument('--image-size', default=None, nargs=2, type=int,
help='Image size for training. If not set then will be dynamic')
parser.add_argument('--batch-size', default=None, type=int,
help='input batch size. if not set then will be dynamic')
parser.add_argument('--device', default='cuda', help='device')
args = parser.parse_args()
args.output = args.output or (args.backbone+'.onnx')
return args
def main(args):
batch_size = args.batch_size or 1
image_size = args.image_size or [800, 800]
print("Loading model")
model = None
if args.backbone=="resnet50":
model = resnet50(pretrained=True,
norm_layer=misc_nn_ops.FrozenBatchNorm2d)
elif args.backbone=="resnext50_32x4d":
model = resnext50_32x4d(pretrained=True,
norm_layer=misc_nn_ops.FrozenBatchNorm2d)
device = torch.device(args.device)
model.to(device)
print("Creating input tensor")
rand = torch.randn(batch_size, 3, image_size[0], image_size[1],
device=device,
requires_grad=False,
dtype=torch.float)
inputs = torch.autograd.Variable(rand)
dynamic_axes = {}
# Input dynamic axes
if (args.batch_size is None) or (args.image_size is None):
dynamic_axes['images'] = {}
if args.batch_size is None:
dynamic_axes['images'][0] = 'batch_size'
if args.image_size is None:
dynamic_axes['images'][2] = 'width'
dynamic_axes['images'][3] = 'height'
print("Exporting the model")
torch.onnx.export(model,
inputs,
args.output,
export_params=True,
opset_version=13,
input_names=['images'],
dynamic_axes=dynamic_axes)
if __name__ == "__main__":
args = parse_args()
main(args)
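# Example invocation (the script's file name is not part of this commit, so the name below is
# only an assumption):
#   python backbone_to_onnx.py --backbone resnext50_32x4d --batch-size 1 --image-size 800 800
# Leaving --batch-size and/or --image-size unset keeps the corresponding input axes dynamic in
# the exported ONNX graph, as handled by the dynamic_axes logic above.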
#!/usr/bin/env bash
DOWNLOAD_LINK='https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth'
SHA512='15c9f0bc1c8d64750712f86ffaded3b0bc6a87e77a395dcda3013d8af65b7ebf3ca1c24dd3aae60c0d83e510b4d27731f0526b6f9392c0a85ffc18e5fecd8a13'
FILENAME='resnext50_32x4d-7cdf4587.pth'
wget -c $DOWNLOAD_LINK
echo "${SHA512} ./${FILENAME}" | sha512sum -c
#!/bin/bash
: "${DOWNLOAD_PATH:=/datasets/downloads/coco2017}"
: "${OUTPUT_PATH:=/datasets/coco2017}"
while [ "$1" != "" ]; do
case $1 in
-d | --download-path ) shift
DOWNLOAD_PATH=$1
;;
-o | --output-path ) shift
OUTPUT_PATH=$1
;;
esac
shift
done
mkdir -p $DOWNLOAD_PATH
cd $DOWNLOAD_PATH
wget -c http://images.cocodataset.org/zips/train2017.zip
wget -c http://images.cocodataset.org/zips/val2017.zip
wget -c http://images.cocodataset.org/annotations/annotations_trainval2017.zip
echo "cced6f7f71b7629ddf16f17bbcfab6b2 ./train2017.zip" | md5sum -c
echo "442b8da7639aecaf257c1dceb8ba8c80 ./val2017.zip" | md5sum -c
echo "f4bbac642086de4f52a3fdda2de5fa2c ./annotations_trainval2017.zip" | md5sum -c
mkdir -p $OUTPUT_PATH
unzip train2017.zip -d $OUTPUT_PATH
unzip val2017.zip -d $OUTPUT_PATH
unzip annotations_trainval2017.zip -d $OUTPUT_PATH
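# Example usage (assuming this script is saved as download_coco.sh; the file name is not given here):
#   ./download_coco.sh --download-path /datasets/downloads/coco2017 --output-path /datasets/coco2017
# Both flags are optional; the defaults defined at the top are used when they are omitted.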
#!/bin/bash
: "${DATASET_PATH:=/datasets/open-images-v6}"
while [ "$1" != "" ]; do
case $1 in
-d | --dataset-path ) shift
DATASET_PATH=$1
;;
esac
shift
done
python fiftyone_openimages.py \
--dataset-dir=${DATASET_PATH}
#!/bin/bash
: "${DATASET_PATH:=/datasets/open-images-v6-mlperf}"
while [ "$1" != "" ]; do
case $1 in
-d | --dataset-path ) shift
DATASET_PATH=$1
;;
esac
shift
done
MLPERF_CLASSES=('Airplane' 'Antelope' 'Apple' 'Backpack' 'Balloon' 'Banana'
'Barrel' 'Baseball bat' 'Baseball glove' 'Bee' 'Beer' 'Bench' 'Bicycle'
'Bicycle helmet' 'Bicycle wheel' 'Billboard' 'Book' 'Bookcase' 'Boot'
'Bottle' 'Bowl' 'Bowling equipment' 'Box' 'Boy' 'Brassiere' 'Bread'
'Broccoli' 'Bronze sculpture' 'Bull' 'Bus' 'Bust' 'Butterfly' 'Cabinetry'
'Cake' 'Camel' 'Camera' 'Candle' 'Candy' 'Cannon' 'Canoe' 'Carrot' 'Cart'
'Castle' 'Cat' 'Cattle' 'Cello' 'Chair' 'Cheese' 'Chest of drawers' 'Chicken'
'Christmas tree' 'Coat' 'Cocktail' 'Coffee' 'Coffee cup' 'Coffee table' 'Coin'
'Common sunflower' 'Computer keyboard' 'Computer monitor' 'Convenience store'
'Cookie' 'Countertop' 'Cowboy hat' 'Crab' 'Crocodile' 'Cucumber' 'Cupboard'
'Curtain' 'Deer' 'Desk' 'Dinosaur' 'Dog' 'Doll' 'Dolphin' 'Door' 'Dragonfly'
'Drawer' 'Dress' 'Drum' 'Duck' 'Eagle' 'Earrings' 'Egg (Food)' 'Elephant'
'Falcon' 'Fedora' 'Flag' 'Flowerpot' 'Football' 'Football helmet' 'Fork'
'Fountain' 'French fries' 'French horn' 'Frog' 'Giraffe' 'Girl' 'Glasses'
'Goat' 'Goggles' 'Goldfish' 'Gondola' 'Goose' 'Grape' 'Grapefruit' 'Guitar'
'Hamburger' 'Handbag' 'Harbor seal' 'Headphones' 'Helicopter' 'High heels'
'Hiking equipment' 'Horse' 'House' 'Houseplant' 'Human arm' 'Human beard'
'Human body' 'Human ear' 'Human eye' 'Human face' 'Human foot' 'Human hair'
'Human hand' 'Human head' 'Human leg' 'Human mouth' 'Human nose' 'Ice cream'
'Jacket' 'Jeans' 'Jellyfish' 'Juice' 'Kitchen & dining room table' 'Kite'
'Lamp' 'Lantern' 'Laptop' 'Lavender (Plant)' 'Lemon' 'Light bulb' 'Lighthouse'
'Lily' 'Lion' 'Lipstick' 'Lizard' 'Man' 'Maple' 'Microphone' 'Mirror'
'Mixing bowl' 'Mobile phone' 'Monkey' 'Motorcycle' 'Muffin' 'Mug' 'Mule'
'Mushroom' 'Musical keyboard' 'Necklace' 'Nightstand' 'Office building'
'Orange' 'Owl' 'Oyster' 'Paddle' 'Palm tree' 'Parachute' 'Parrot' 'Pen'
'Penguin' 'Personal flotation device' 'Piano' 'Picture frame' 'Pig' 'Pillow'
'Pizza' 'Plate' 'Platter' 'Porch' 'Poster' 'Pumpkin' 'Rabbit' 'Rifle'
'Roller skates' 'Rose' 'Salad' 'Sandal' 'Saucer' 'Saxophone' 'Scarf' 'Sea lion'
'Sea turtle' 'Sheep' 'Shelf' 'Shirt' 'Shorts' 'Shrimp' 'Sink' 'Skateboard'
'Ski' 'Skull' 'Skyscraper' 'Snake' 'Sock' 'Sofa bed' 'Sparrow' 'Spider' 'Spoon'
'Sports uniform' 'Squirrel' 'Stairs' 'Stool' 'Strawberry' 'Street light'
'Studio couch' 'Suit' 'Sun hat' 'Sunglasses' 'Surfboard' 'Sushi' 'Swan'
'Swimming pool' 'Swimwear' 'Tank' 'Tap' 'Taxi' 'Tea' 'Teddy bear' 'Television'
'Tent' 'Tie' 'Tiger' 'Tin can' 'Tire' 'Toilet' 'Tomato' 'Tortoise' 'Tower'
'Traffic light' 'Train' 'Tripod' 'Truck' 'Trumpet' 'Umbrella' 'Van' 'Vase'
'Vehicle registration plate' 'Violin' 'Wall clock' 'Waste container' 'Watch'
'Whale' 'Wheel' 'Wheelchair' 'Whiteboard' 'Window' 'Wine' 'Wine glass' 'Woman'
'Zebra' 'Zucchini')
python fiftyone_openimages.py \
--dataset-dir=${DATASET_PATH} \
--output-labels="openimages-mlperf.json" \
--classes "${MLPERF_CLASSES[@]}"
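# Example usage (script name assumed, not part of this commit):
#   ./download_openimages_mlperf.sh --dataset-path /datasets/open-images-v6-mlperf
# This downloads only the 264 MLPerf classes listed above and writes the COCO-style labels to
# openimages-mlperf.json via fiftyone_openimages.py.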
#!/usr/bin/env python
import os
import json
import argparse
import fiftyone as fo
import fiftyone.zoo as foz
parser = argparse.ArgumentParser(description='Download OpenImages using FiftyOne', add_help=True)
parser.add_argument('--dataset-dir', default='/open-images-v6', help='dataset download location')
parser.add_argument('--splits', default=['train', 'validation'], choices=['train', 'validation', 'test'],
nargs='+', type=str,
help='Splits to download, possible values are train, validation and test')
parser.add_argument('--classes', default=None, nargs='+', type=str,
help='Classes to download. Defaults to all classes')
parser.add_argument('--output-labels', default='labels.json', type=str,
help='Name of the output labels file')
args = parser.parse_args()
print("Downloading open-images dataset ...")
dataset = foz.load_zoo_dataset(
name="open-images-v6",
classes=args.classes,
splits=args.splits,
label_types="detections",
dataset_name="open-images",
dataset_dir=args.dataset_dir
)
print("Converting dataset to coco format ...")
for split in args.splits:
output_fname = os.path.join(args.dataset_dir, split, "labels", args.output_labels)
split_view = dataset.match_tags(split)
split_view.export(
labels_path=output_fname,
dataset_type=fo.types.COCODetectionDataset,
label_field="detections",
classes=args.classes)
# Add iscrowd label to openimages annotations
with open(output_fname) as fp:
labels = json.load(fp)
for annotation in labels['annotations']:
annotation['iscrowd'] = int(annotation['IsGroupOf'])
with open(output_fname, "w") as fp:
json.dump(labels, fp)
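# Example invocation (matches how the download scripts above call this file):
#   python fiftyone_openimages.py --dataset-dir /datasets/open-images-v6-mlperf \
#       --output-labels openimages-mlperf.json --classes Airplane Antelope ...
# After export, each COCO annotation gains an 'iscrowd' field copied from OpenImages'
# IsGroupOf flag, which the COCO evaluation code expects.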
#!/usr/bin/env python3
import argparse
import torch
import torch.onnx
import torchvision
from torch.autograd import Variable
from model.retinanet import retinanet_from_backbone
def parse_args(add_help=True):
parser = argparse.ArgumentParser(description='Convert PyTorch detection file to onnx format', add_help=add_help)
parser.add_argument('--input', required=True, help='input pth file')
parser.add_argument('--output', default=None, help='output onnx file')
parser.add_argument('--backbone', default='resnext50_32x4d',
choices=['resnet50', 'resnext50_32x4d', 'resnet101', 'resnext101_32x8d'],
help='The model backbone')
parser.add_argument('--num-classes', default=264, type=int,
help='Number of detection classes')
parser.add_argument('--trainable-backbone-layers', default=3, type=int,
help='number of trainable layers of backbone')
parser.add_argument('--image-size', default=None, nargs=2, type=int,
help='Image size for training. If not set then will be dynamic')
parser.add_argument('--batch-size', default=None, type=int,
help='input batch size. if not set then will be dynamic')
parser.add_argument('--data-layout', default="channels_first", choices=['channels_first', 'channels_last'],
help="Model data layout")
parser.add_argument('--device', default='cuda', help='device')
args = parser.parse_args()
args.output = args.output or ('retinanet_'+args.backbone+'.onnx')
return args
def main(args):
batch_size = args.batch_size or 1
image_size = args.image_size or [800, 800]
print("Creating model")
model = retinanet_from_backbone(backbone=args.backbone,
num_classes=args.num_classes,
image_size=image_size,
data_layout=args.data_layout,
pretrained=False,
trainable_backbone_layers=args.trainable_backbone_layers)
device = torch.device(args.device)
model.to(device)
print("Loading model")
checkpoint = torch.load(args.input)
model.load_state_dict(checkpoint['model'])
print("Creating input tensor")
rand = torch.randn(batch_size, 3, image_size[0], image_size[1],
device=device,
requires_grad=False,
dtype=torch.float)
inputs = torch.autograd.Variable(rand)
# Output dynamic axes
dynamic_axes = {
'boxes': {0 : 'num_detections'},
'scores': {0 : 'num_detections'},
'labels': {0 : 'num_detections'},
}
# Input dynamic axes
if (args.batch_size is None) or (args.image_size is None):
dynamic_axes['images'] = {}
if args.batch_size is None:
dynamic_axes['images'][0] = 'batch_size'
if args.image_size is None:
dynamic_axes['images'][2] = 'width'
dynamic_axes['images'][3] = 'height'
print("Exporting the model")
model.eval()
torch.onnx.export(model,
inputs,
args.output,
export_params=True,
opset_version=13,
do_constant_folding=False,
input_names=['images'],
output_names=['boxes', 'scores', 'labels'],
dynamic_axes=dynamic_axes)
if __name__ == "__main__":
args = parse_args()
main(args)
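# Example invocation (the script file name is an assumption):
#   python pth_to_onnx.py --input retinanet_model.pth --backbone resnext50_32x4d --num-classes 264
# The output defaults to retinanet_<backbone>.onnx; boxes/scores/labels keep a dynamic
# num_detections axis, and the image axes stay dynamic unless --batch-size/--image-size are set.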
#!/usr/bin/env python3
from argparse import ArgumentParser
import pickle
import numpy as np
import torch
parser = ArgumentParser(description="Convert a pytorch (.pth) file to a pickled dictionary of numpy arrays. "
"The dictionary will have the following format: \n"
"{pytorch param name: numpy array}")
parser.add_argument('input_file', type=str, help='input pytorch .pth file')
parser.add_argument('output_file', type=str, help='output pickle file')
parser.add_argument('-v', '--verbose', action='store_true',
help='print parameters names and statistics')
args = parser.parse_args()
dict_out = {}
pth_input = torch.load(open(args.input_file, 'rb'))
for key, value in pth_input.items():
dict_out[key] = value.data.numpy()
if args.verbose:
print("name, dtype, mean, std, min, max")
for key, value in dict_out.items():
t_mean = np.mean(value)
t_std = np.std(value)
t_min = np.min(value)
t_max = np.max(value)
print(f"{key}, {value.dtype}, {value.shape}, {t_mean:0.3}, {t_std:0.3}, {t_min:0.3}, {t_max:0.3}")
pickle.dump(dict_out, open(args.output_file, 'wb'))
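# Example invocation (file names here are placeholders):
#   python pth_to_pickle.py retinanet_model.pth retinanet_model.pkl --verbose
# The resulting pickle maps each PyTorch parameter name to a numpy array, which makes the
# weights easy to load outside of PyTorch.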
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import utils
from torch.utils.data import Dataset
from engine import preprocessing, loss_preprocessing, compute_matched_idxs
def init_cache(model_ptr, dataset, device, args, cache_sz):
cache_images_, cache_targets_ = [], []
for j in range(int(cache_sz / args.batch_size)):
images, targets = [], []
for i in range(args.batch_size):
images.append(dataset[j * int(cache_sz / args.batch_size) + i][0])
targets.append(dataset[j * int(cache_sz / args.batch_size) + i][1])
images = list(image.to(device, non_blocking=True) for image in images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
with torch.cuda.amp.autocast(enabled=args.amp):
targets['matched_idxs'] = compute_matched_idxs(targets['boxes'], model_ptr)
for i in range(args.batch_size):
cache_images_.append(images[i].cpu())
cache_targets_.append({'matched_idxs': targets['matched_idxs'][i].cuda(),
'labels': targets['labels'][i].cuda(),
'boxes': targets['boxes'][i].cuda()})
return cache_images_, cache_targets_
def get_cached_dataset(model, dataset, device, args, cache_sz=16, virtual_cache_sz_factor=32768):
cache_images_, cache_targets_ = init_cache(model, dataset, device, args, cache_sz)
cached_dataset = CachedDataset(cache_sz, virtual_cache_sz_factor, cache_images_, cache_targets_)
if args.distributed:
cached_train_sampler = torch.utils.data.distributed.DistributedSampler(cached_dataset)
else:
cached_train_sampler = torch.utils.data.RandomSampler(cached_dataset)
cached_train_batch_sampler = torch.utils.data.BatchSampler(cached_train_sampler, args.batch_size, drop_last=True)
cached_data_loader = torch.utils.data.DataLoader(cached_dataset, batch_sampler=cached_train_batch_sampler,
num_workers=0, pin_memory=False, collate_fn=utils.collate_fn)
return cached_data_loader
class CachedDataset(Dataset):
def __init__(self, cache_sz, virtual_cache_sz_factor, cache_images, cache_targets):
self.cache_sz = cache_sz
self.virtual_cache_sz_factor = virtual_cache_sz_factor
self.virtual_dataset_sz = self.cache_sz * self.virtual_cache_sz_factor
self.cache_images = cache_images
self.cache_targets = cache_targets
def __len__(self):
return self.virtual_dataset_sz
def __getitem__(self, idx):
return self.cache_images[idx % self.cache_sz], self.cache_targets[idx % self.cache_sz]
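# Usage sketch: train.py builds this loader when --syn-dataset is passed, e.g.
#   data_loader = get_cached_dataset(model_without_ddp, dataset, device, args)
# A small cache of cache_sz preprocessed images is replayed virtual_cache_sz_factor times per
# "epoch", so the data pipeline cost is removed while real images are still used.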
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import random
import argparse
import datetime
import math
import numpy as np
import torch
import torch.utils.data
import apex_C
from model.frozen_bn import FrozenBatchNorm2d
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import (SSD, STATUS, SUCCESS, ABORTED, INIT_START, INIT_STOP, RUN_START, RUN_STOP,
SEED, GLOBAL_BATCH_SIZE, TRAIN_SAMPLES, EVAL_SAMPLES, EPOCH_COUNT,
FIRST_EPOCH_NUM, OPT_NAME, ADAM, OPT_BASE_LR, OPT_WEIGHT_DECAY,
OPT_LR_WARMUP_EPOCHS, OPT_LR_WARMUP_FACTOR, GRADIENT_ACCUMULATION_STEPS)
import utils
import presets
from coco.coco_utils import get_coco, get_openimages
from engine import train_one_epoch, evaluate
from model.retinanet import retinanet_from_backbone, cudnn_fusion_warmup
import model_capture
import apex
from syn_dataset import get_cached_dataset
from mlperf_common.scaleoutbridge import init_bridge, ScaleoutBridgeBase as SBridge
from mlperf_common.frameworks.pyt import PyTProfilerHandler, PyTCommunicationHandler
from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import fp16_compress_hook
from async_executor import async_executor
try:
from dali import DaliDataIterator
except ImportError as err:
print("Could not import DaliDataIterator, it's fine if you do not use --dali")
def get_dataset_fn(dataset, dataset_path):
dataset_fn = None
num_classes = None
train_data_path = None
train_annotations_file = None
val_data_path = None
val_annotations_file = None
if dataset == 'coco':
dataset_fn = get_coco
num_classes = 91
train_sz = 117266
val_sz = 5000
train_data_path = os.path.join(dataset_path, 'train2017')
train_annotations_file = os.path.join(dataset_path, 'annotations', 'instances_train2017.json')
val_data_path = os.path.join(dataset_path, 'val2017')
val_annotations_file = os.path.join(dataset_path, 'annotations', 'instances_val2017.json')
elif dataset == 'openimages':
# Full openimages dataset
dataset_fn = get_openimages
num_classes = 601
train_sz = 1743042
val_sz = 41620
train_data_path = os.path.join(dataset_path, 'train', 'data')
train_annotations_file = os.path.join(dataset_path, 'train', 'labels', 'openimages.json')
val_data_path = os.path.join(dataset_path, 'validation', 'data')
val_annotations_file = os.path.join(dataset_path, 'validation', 'labels', 'openimages.json')
elif dataset == 'openimages-mlperf':
# L0 classes with more than 1000 samples
dataset_fn = get_openimages
num_classes = 264
train_sz = 1170301
val_sz = 24781
train_data_path = os.path.join(dataset_path, 'train', 'data')
train_annotations_file = os.path.join(dataset_path, 'train', 'labels', 'openimages-mlperf.json')
val_data_path = os.path.join(dataset_path, 'validation', 'data')
val_annotations_file = os.path.join(dataset_path, 'validation', 'labels', 'openimages-mlperf.json')
else:
assert False, "Unknown dataset = {dataset}"
return (dataset_fn, num_classes, train_sz, val_sz,
train_data_path, train_annotations_file, val_data_path, val_annotations_file)
def get_transform(train, data_augmentation):
return presets.DetectionPresetTrain(data_augmentation) if train else presets.DetectionPresetEval()
def cast_frozen_bn_half(module: torch.nn.Module):
for name, child_module in module.named_children():
if isinstance(child_module, FrozenBatchNorm2d):
child_module.half()
elif len(list(child_module.children())) > 0:
cast_frozen_bn_half(child_module)
def parse_args(add_help=True):
parser = argparse.ArgumentParser(description='PyTorch Detection Training', add_help=add_help)
# Model
parser.add_argument('--backbone', default='resnext50_32x4d',
choices=['resnet50', 'resnext50_32x4d', 'resnet101', 'resnext101_32x8d'],
help='The model backbone')
parser.add_argument('--trainable-backbone-layers', default=3, type=int,
help='number of trainable layers of backbone')
parser.add_argument('--sync-bn', dest='sync_bn', action="store_true", help='Use sync batch norm')
parser.add_argument('--data-layout', default="channels_last", choices=['channels_first', 'channels_last'],
help="Model data layout")
parser.add_argument("--amp", dest='amp', action="store_true",
help="Whether to enable Automatic Mixed Precision (AMP). "
"When false, uses TF32 on A100 and FP32 on V100 GPUS.")
parser.add_argument("--no-amp", dest='amp', action="store_false",
help="Whether to enable Automatic Mixed Precision (AMP). "
"When false, uses TF32 on A100 and FP32 on V100 GPUS.")
parser.set_defaults(amp=True)
# Async validation
parser.add_argument("--async-coco", action="store_true",
help="Enable asynchronous coco scoring")
parser.add_argument("--async-coco-check-freq", default=20, type=int,
help="Enable asynchronous coco scoring")
parser.add_argument("--num-eval-ranks", default=None, type=int,
help="Number of validation ranks. default to use")
# Dataset
parser.add_argument('--dataset', default='openimages-mlperf',
choices=['coco', 'openimages', 'openimages-mlperf'],
help='dataset')
parser.add_argument('--dataset-path', default='/datasets/open-images-v6',
help='dataset root path')
parser.add_argument('--num-classes', default=None, type=int,
help='Number of classes in the dataset. By default will be inferred from --dataset')
parser.add_argument('--train-data-path', default=None, type=str,
help='Training images folder. By default will be inferred from --dataset')
parser.add_argument('--train-annotations-file', default=None, type=str,
help='Training annotations file. By default will be inferred from --dataset')
parser.add_argument('--val-data-path', default=None, type=str,
help='Validation images folder. By default will be inferred from --dataset')
parser.add_argument('--val-annotations-file', default=None, type=str,
help='Validation annotations file. By default will be inferred from --dataset')
parser.add_argument('--image-size', default=[800, 800], nargs=2, type=int,
help='Image size for training')
parser.add_argument('--data-augmentation', default="hflip", help='data augmentation policy')
# Train parameters
parser.add_argument('--epochs', default=26, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, help='start epoch')
parser.add_argument('--output-dir', default=None, help='path where to save checkpoints.')
parser.add_argument('--target-map', default=0.34, type=float, help='Stop training when target mAP is reached')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument("--pretrained", dest="pretrained", action="store_true",
help="Use pre-trained models from the modelzoo")
# Hyperparameters
parser.add_argument('-b', '--batch-size', default=2, type=int,
help='images per gpu, the total batch size is $NGPU x batch_size')
parser.add_argument('-e', '--eval-batch-size', default=None, type=int,
help='evaluation images per gpu, the total batch size is $NGPU x batch_size')
parser.add_argument('--lr', default=0.02, type=float,
help='initial learning rate, 0.02 is the default value for training '
'on 8 gpus and 2 images_per_gpu')
parser.add_argument('--warmup-epochs', default=1, type=int,
help='length of the learning rate warmup, in epochs')
parser.add_argument('--warmup-factor', default=1e-3, type=float,
help='factor for controlling warmup curve')
# Other
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
parser.add_argument('--eval-print-freq', default=None, type=int, help='eval print frequency')
parser.add_argument("--test-only", dest="test_only", action="store_true", help="Only test the model")
parser.add_argument('--seed', '-s', type=int, default=random.SystemRandom().randint(0, 2**32 - 1),
help='manually set random seed')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('--cocoeval', default='nvidia',
choices=['python', 'nvidia'],
help='Choose the cocoeval implementation (nvidia is a much faster c++ implementation)')
parser.add_argument('--coco-threads', default=8, type=int,
help='Number of threads to use with --cocoeval=nvidia')
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='tcp://localhost:45679', help='url used to set up distributed training')
# optimizations
parser.add_argument('--frozen-bn-opt', action="store_true", help='calculate frozen BN scale and bias only once')
parser.add_argument('--frozen-bn-fp16', dest="frozen_bn_fp16",
action="store_true", help="cast frozen BN layers to fp16 (use with --amp)")
parser.add_argument('--jit', action="store_true", help="enable fusing opportunities")
parser.add_argument('--cuda-graphs', action="store_true", help='enable CUDA graphs')
parser.add_argument('--cuda-graphs-eval', action="store_true", help='enable CUDA graphs in evaluation')
parser.add_argument('--cls-head-pad', action="store_true",
help='pad classification head (used for CUDA graphs or just parallelization)')
parser.add_argument('--reg-head-pad', action="store_true",
help='pad regression head (used for CUDA graphs or just parallelization)')
parser.add_argument('--cuda-graphs-syn', action="store_true", help='using synthetic data for model capture')
parser.add_argument('--model-warmup-epochs', default=16, type=int,
help='warmup model for JIT and cuDNN using synthetic data')
# DALI
parser.add_argument('--dali', action="store_true",
help='use DALI instead of native PyTorch dataloader during training')
parser.add_argument('--dali-matched-idxs', action="store_true", help='compute matched_idxs within DALI')
parser.add_argument('--dali-eval', action="store_true",
help='use DALI instead of native PyTorch dataloader during evaluation')
parser.add_argument('--dali-eval-cache', action="store_true",
help='Cache test dataset during evaluation')
parser.add_argument('--dali-prefetch-queue-depth', type=int, default=2, help='set DALI prefetch queue depth')
parser.add_argument('--dali-cpu-decode', action="store_true",
help='use CPU-based DALI decoder instead of the mixed one')
# apex optimizations
parser.add_argument('--apex-adam', action="store_true", help="use APEX implementation of Adam")
parser.add_argument('--apex-focal-loss', action="store_true", help="use APEX implementation of focal loss")
parser.add_argument('--apex-head-fusion', action="store_true", help='using APEX conv-bias-relu fusion')
# communication optimizations
parser.add_argument('--disable-ddp-broadcast-buffers', dest='broadcast_buffers', action='store_false',
help='disable DDP broadcast buffers (BNs are frozen)')
parser.add_argument('--fp16-allreduce', action="store_true", help='using fp16 allreduce compression')
parser.add_argument('--ddp-bucket-sz', default=25, type=int, help='DDP bucket size in MB')
parser.add_argument('--ddp-first-bucket-sz', default=None, type=int, help='DDP first bucket size in MB')
# additional params
parser.add_argument('--max-boxes', dest='max_boxes', type=int, default=1000,
help='pad the number of bboxes to max_boxes, used to make functions parallel')
parser.add_argument('--cudnn-bench', dest='cudnn_bench', action='store_true',
help='set torch.backends.cudnn.benchmark')
parser.add_argument('--not-graphed-prologues', action='store_true', help='')
parser.add_argument('--skip-metric-loss', action='store_true', help='')
parser.add_argument('--syn-dataset', dest='syn_dataset', action='store_true',
help='it is actually a semi-synthetic dataset, since the original dataset is required')
parser.add_argument('--sync-after-graph-replay', action='store_true',
help='this is a workaround for the scenario in which DALI is blocked due to optimizer sync '
'driver lock')
args = parser.parse_args()
args.eval_batch_size = args.eval_batch_size or args.batch_size
args.eval_print_freq = args.eval_print_freq or args.print_freq
return args
def main(args):
# CUDA graphs will only work if regression head tensors are padded
assert((args.cuda_graphs and args.reg_head_pad and args.cls_head_pad) or
(args.cuda_graphs and args.not_graphed_prologues) or
(not args.cuda_graphs))
# Do not use DALI when using synthetic data
assert((args.dali and not args.syn_dataset) or not args.dali)
# At the moment, to use JIT FrozenBN fusions, one must use the FrozenBN optimization flag
assert((args.jit and args.frozen_bn_opt) or not args.jit)
# Enable JIT
if args.jit:
assert args.backbone == 'resnext50_32x4d',"JIT was only tested with ResNeXt50-32x4d."
torch._C._jit_set_nvfuser_enabled(True)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_set_profiling_executor(True)
torch._C._jit_set_profiling_mode(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_bailout_depth(20)
# Init distributed mode
train_group, eval_group = utils.init_distributed_mode(args)
# Start MLPerf benchmark
mllogger.mlperf_submission_log(benchmark=SSD)
mllogger.start(key=INIT_START, sync=True)
if args.output_dir:
utils.mkdir(args.output_dir)
torch.backends.cudnn.benchmark = args.cudnn_bench
device = torch.device(args.device)
# set rank seeds according to MLPerf rules
if args.distributed:
args.seed = utils.broadcast(args.seed, src=1, group=None)
args.seed = (args.seed + utils.get_rank()) % 2**32
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
mllogger.event(key=SEED, value=args.seed, unique=False)
# Print args
mllogger.event(key='local_batch_size', value=args.batch_size)
mllogger.event(key=GLOBAL_BATCH_SIZE, value=args.batch_size*args.num_train_ranks)
mllogger.event(key=EPOCH_COUNT, value=args.epochs)
mllogger.event(key=FIRST_EPOCH_NUM, value=args.start_epoch)
print(args)
# Data loading code
print("Getting dataset information")
dataset_fn, num_classes, train_sz, val_sz, \
train_data_path, train_annotations_file, val_data_path, val_annotations_file = \
get_dataset_fn(dataset=args.dataset, dataset_path=args.dataset_path)
args.num_classes = args.num_classes or num_classes
args.train_sz = train_sz
args.val_sz = val_sz
args.train_data_path = args.train_data_path or train_data_path
args.train_annotations_file = args.train_annotations_file or train_annotations_file
args.val_data_path = args.val_data_path or val_data_path
args.val_annotations_file = args.val_annotations_file or val_annotations_file
print("Creating model")
model = retinanet_from_backbone(backbone=args.backbone,
num_classes=num_classes,
image_size=args.image_size,
data_layout=args.data_layout,
pretrained=args.pretrained,
trainable_backbone_layers=args.trainable_backbone_layers,
jit=args.jit,
head_fusion=args.apex_head_fusion,
frozen_bn_opt=args.frozen_bn_opt)
model.to(device)
if args.data_layout == 'channels_last':
model = model.to(memory_format=torch.channels_last)
# cast FrozenBatchNorm2d parameters to FP16
if args.amp and args.frozen_bn_fp16 and args.frozen_bn_opt:
cast_frozen_bn_half(module=model)
if args.distributed and args.sync_bn:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
model_without_ddp = model
if args.distributed:
s = torch.cuda.Stream()
with torch.cuda.stream(s):
if args.ddp_first_bucket_sz is not None:
torch.distributed._DEFAULT_FIRST_BUCKET_BYTES = args.ddp_first_bucket_sz * 1024 * 1024
process_group = train_group if args.rank in args.train_ranks else eval_group
model = torch.nn.parallel.DistributedDataParallel(model,
process_group=process_group,
device_ids=[args.gpu],
broadcast_buffers=args.broadcast_buffers,
bucket_cap_mb=args.ddp_bucket_sz)
model_without_ddp = model.module
if args.fp16_allreduce:
model.register_comm_hook(state=None, hook=fp16_compress_hook)
params = [p for p in model.parameters() if p.requires_grad]
if not args.apex_adam:
optimizer = torch.optim.Adam(params, lr=args.lr)
else:
optimizer = apex.optimizers.FusedAdam(params, lr=args.lr)
mllogger.event(key=OPT_NAME, value=ADAM)
mllogger.event(key=OPT_BASE_LR, value=args.lr)
mllogger.event(key=OPT_WEIGHT_DECAY, value=0)
mllogger.event(key=OPT_LR_WARMUP_EPOCHS, value=args.warmup_epochs)
mllogger.event(key=OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
mllogger.event(key=GRADIENT_ACCUMULATION_STEPS, value=1)
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
args.start_epoch = checkpoint['epoch'] + 1
# using default feature sizes to create anchors (happens just once)
model_without_ddp.update_anchors(torch.Size([args.batch_size, 3, args.image_size[0], args.image_size[1]]), device,
dtype=(torch.float16 if args.amp else torch.float32))
# no need for eval warmup here, since warmup is also part of graph capture
if args.model_warmup_epochs > 0 and not args.cuda_graphs_eval:
print('Model eval warmup')
assert(args.dataset == 'openimages-mlperf' and args.image_size == [800, 800])
start_time = time.time()
bs_list = [args.eval_batch_size]
eval_sz = args.val_sz
eval_last_iter_bs = int(math.ceil(float(eval_sz) % (args.num_eval_ranks * args.eval_batch_size) / args.num_eval_ranks))
if eval_last_iter_bs > 0:
if eval_last_iter_bs != args.eval_batch_size:
bs_list.append(eval_last_iter_bs)
if eval_last_iter_bs > 1:
bs_list.append(eval_last_iter_bs - 1)
# TODO: since during training the model is usually graphed, we skip this warmup at the moment
# train_sz = args.train_sz
# train_last_iter_bs = int(math.ceil(float(train_sz) % (world_size * args.batch_size) / world_size))
# if train_last_iter_bs > 0 and train_last_iter_bs != eval_last_iter_bs:
# bs_list.append(train_last_iter_bs)
for bs in bs_list:
model_capture.model_eval_warmup(model, bs, args.model_warmup_epochs, args)
#cudnn_fusion_warmup(bs_list)
total_time = time.time() - start_time
print('Time: {} sec'.format(total_time))
# GradScaler for AMP
scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
data_loader = None
data_loader_test = None
# The DALI-based data_loader doesn't touch data at init time (lazy_init=True), so we create it before RUN_START
if args.dali and (args.rank in args.train_ranks):
print("Creating Dali dataloader")
data_loader = DaliDataIterator(data_path=args.train_data_path,
anno_path=args.train_annotations_file,
batch_size=args.batch_size,
num_shards=args.num_train_ranks,
shard_id=args.train_rank,
is_training=True,
image_size=args.image_size,
num_threads=args.workers,
prefetch_queue_depth=args.dali_prefetch_queue_depth,
compute_matched_idxs=args.dali_matched_idxs,
anchors=model_without_ddp.anchors,
cpu_decode=args.dali_cpu_decode,
lazy_init=True,
cache=False,
seed=args.seed)
# Preparing CUDA graph using the synthetic data
graphed_model, static_input, static_loss, static_prologues_out = None, None, None, None
if args.cuda_graphs and args.cuda_graphs_syn:
graphed_model, static_input, static_loss, static_prologues_out = \
model_capture.whole_model_capture(model, optimizer, scaler, None, args)
graphed_model_eval, static_input_eval, static_model_output_eval = None, None, None
if args.cuda_graphs_eval and args.cuda_graphs_syn:
graphed_model_eval, static_input_eval, static_model_output_eval = \
model_capture.whole_model_capture_eval(model, None, args)
mllogger.end(key=INIT_STOP, sync=True)
start_time = time.time()
sbridge = init_bridge(PyTProfilerHandler(), PyTCommunicationHandler(), mllogger)
mllogger.start(key=RUN_START, sync=True)
sbridge.start_prof(SBridge.LOAD_TIME)
# The pytorch based data_loader touches data at init time. So we place it after RUN_START
if not args.dali and (args.rank in args.train_ranks) and (not args.test_only):
#if not args.dali and (args.rank in args.train_ranks):
print("Creating PyTorch dataloader")
dataset = dataset_fn(dataset_path=args.train_data_path,
annotations_file=args.train_annotations_file,
transforms=get_transform(True, args.data_augmentation),
training=True)
if args.syn_dataset:
data_loader = get_cached_dataset(model_without_ddp, dataset, device, args)
else:
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset=dataset,
num_replicas=args.num_train_ranks,
rank=args.train_rank)
else:
train_sampler = torch.utils.data.RandomSampler(dataset)
train_batch_sampler = torch.utils.data.BatchSampler(sampler=train_sampler,
batch_size=args.batch_size,
drop_last=True)
data_loader = torch.utils.data.DataLoader(
dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
pin_memory=True, collate_fn=utils.collate_fn)
if args.rank in args.train_ranks and (not args.test_only):
mllogger.event(key=TRAIN_SAMPLES, value=len(data_loader))
if args.rank in args.eval_ranks:
if args.dali_eval:
data_loader_test = DaliDataIterator(data_path=args.val_data_path,
anno_path=args.val_annotations_file,
batch_size=args.eval_batch_size,
num_shards=args.num_eval_ranks,
shard_id=args.eval_rank,
is_training=False,
image_size=args.image_size,
num_threads=args.workers,
prefetch_queue_depth=args.dali_prefetch_queue_depth,
compute_matched_idxs=False,
anchors=model_without_ddp.anchors,
cpu_decode=args.dali_cpu_decode,
lazy_init=True,
cache=args.dali_eval_cache,
seed=args.seed)
else:
dataset_test = dataset_fn(dataset_path=args.val_data_path,
annotations_file=args.val_annotations_file,
transforms=get_transform(False, args.data_augmentation),
training=False)
if args.distributed:
test_sampler = torch.utils.data.distributed.DistributedSampler(dataset=dataset_test,
num_replicas=args.num_eval_ranks,
rank=args.eval_rank,
shuffle=False)
else:
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=args.eval_batch_size or args.batch_size,
sampler=test_sampler, num_workers=args.workers,
pin_memory=True, collate_fn=utils.collate_fn)
sbridge.stop_prof(SBridge.LOAD_TIME)
if args.rank in args.eval_ranks and (not args.test_only):
mllogger.event(key=EVAL_SAMPLES, value=len(data_loader_test))
# Preparing CUDA graph using the dataset
if args.cuda_graphs and not args.cuda_graphs_syn:
graphed_model, static_input, static_loss, static_prologues_out = \
model_capture.whole_model_capture(model, optimizer, scaler, data_loader, args)
if args.cuda_graphs_eval and not args.cuda_graphs_syn:
graphed_model_eval, static_input_eval, static_model_output_eval = \
model_capture.whole_model_capture_eval(model, data_loader_test, args)
print("Running ...")
status = ABORTED
accuracy = None
if args.test_only and (args.rank in args.eval_ranks):
accuracy = evaluate(model=model,
data_loader=data_loader_test,
device=device,
epoch=None,
eval_group=eval_group,
args=args,
graphed_model=graphed_model_eval, static_input=static_input_eval,
static_output=static_model_output_eval,
sbridge=sbridge)
print(f'Model mAP = {accuracy}')
if args.target_map and accuracy and accuracy >= args.target_map:
status = SUCCESS
else:
for epoch in range(args.start_epoch, args.epochs):
############################################################################################################
# Train
############################################################################################################
if args.rank in args.train_ranks:
if args.distributed and not args.dali and not args.syn_dataset:
train_sampler.set_epoch(epoch)
metric_logger, accuracy = train_one_epoch(model=model,
optimizer=optimizer,
scaler=scaler,
data_loader=data_loader,
device=device,
epoch=epoch,
train_group=train_group,
args=args,
graphed_model=graphed_model,
static_input=static_input,
static_loss=static_loss,
static_prologues_out=static_prologues_out,
sbridge=sbridge)
if args.output_dir:
checkpoint = {
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'epoch': epoch+1,
'args': args,
}
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch+1)))
utils.save_on_master(
checkpoint,
os.path.join(args.output_dir, 'checkpoint.pth'))
if args.target_map and accuracy and accuracy >= args.target_map:
status = SUCCESS
break
############################################################################################################
############################################################################################################
# Sync train and val ranks (only if they are on different sets of nodes)
############################################################################################################
if args.eval_ranks != args.train_ranks:
# TODO(ahmadki): do we need to sync params without grads ?
params = [param for param in model.parameters()]
# params = [param for param in model.parameters() if param.requires_grad]
flat_params = apex_C.flatten(params)
# sync train and val
utils.barrier(group=None)
# broadcast train->val (actually train(0)->all)
torch.distributed.broadcast(flat_params, 0)
############################################################################################################
############################################################################################################
# Validation
############################################################################################################
if args.rank in args.eval_ranks:
accuracy = evaluate(model=model,
data_loader=data_loader_test,
device=device,
epoch=epoch+1,
eval_group=eval_group,
args=args,
graphed_model=graphed_model_eval, static_input=static_input_eval,
static_output=static_model_output_eval,
sbridge=sbridge)
if args.target_map and accuracy and accuracy >= args.target_map:
status = SUCCESS
break
############################################################################################################
# Wait for async coco jobs if necessary
if args.async_coco:
while status != SUCCESS and len(async_executor.tags()):
# FIXME(ahmadki): --num-eval-ranks
if args.eval_rank == 0:
results = async_executor.pop_if_done()
# in case of multiple results are returned, get the highest mAP
if results and len(results) > 0:
accuracy = max([result['bbox'][0] for result in results.values() if result], default=-1)
if args.distributed:
accuracy = utils.broadcast(accuracy, 0, group=None)
if args.target_map and accuracy and accuracy >= args.target_map:
status = SUCCESS
mllogger.end(key=RUN_STOP, metadata={"status": status}, sync=True)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
mllogger.event(key=STATUS, value=status, unique=False)
if __name__ == "__main__":
args = parse_args()
main(args)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torchvision
from torch import nn, Tensor
from torchvision.transforms import functional as F
from torchvision.transforms import transforms as T
from typing import List, Tuple, Dict, Optional
def _flip_coco_person_keypoints(kps, width):
flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
flipped_data = kps[:, flip_inds]
flipped_data[..., 0] = width - flipped_data[..., 0]
# Maintain COCO convention that if visibility == 0, then x, y = 0
inds = flipped_data[..., 2] == 0
flipped_data[inds] = 0
return flipped_data
################################################################################
# TODO(ahmadki): remove this block, and replace get_image_size with F.get_image_size
# once https://github.com/pytorch/vision/pull/4321 is public
from PIL import Image, ImageOps, ImageEnhance
Image.MAX_IMAGE_PIXELS = None
from typing import Any
try:
import accimage
except ImportError:
accimage = None
@torch.jit.unused
def _is_pil_image(img: Any) -> bool:
if accimage is not None:
return isinstance(img, (Image.Image, accimage.Image))
else:
return isinstance(img, Image.Image)
def get_image_size_tensor(img: Tensor) -> List[int]:
# Returns (w, h) of tensor image
_assert_image_tensor(img)
return [img.shape[-1], img.shape[-2]]
@torch.jit.unused
def get_image_size_pil(img: Any) -> List[int]:
if _is_pil_image(img):
return list(img.size)
raise TypeError("Unexpected type {}".format(type(img)))
def get_image_size(img: Tensor) -> List[int]:
"""Returns the size of an image as [width, height].
Args:
img (PIL Image or Tensor): The image to be checked.
Returns:
List[int]: The image size.
"""
if isinstance(img, torch.Tensor):
return get_image_size_tensor(img)
return get_image_size_pil(img)
def get_image_num_channels_tensor(img: Tensor) -> int:
_assert_image_tensor(img)
if img.ndim == 2:
return 1
elif img.ndim > 2:
return img.shape[-3]
raise TypeError(f"Input ndim should be 2 or more. Got {img.ndim}")
@torch.jit.unused
def get_image_num_channels_pil(img: Any) -> int:
if _is_pil_image(img):
return len(img.getbands())
raise TypeError("Unexpected type {}".format(type(img)))
def get_image_num_channels(img: Tensor) -> int:
if isinstance(img, torch.Tensor):
return get_image_num_channels_tensor(img)
return get_image_num_channels_pil(img)
################################################################################
class Compose(object):
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, image, target):
for t in self.transforms:
image, target = t(image, target)
return image, target
class RandomHorizontalFlip(T.RandomHorizontalFlip):
def forward(self, image: Tensor,
target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if torch.rand(1) < self.p:
image = F.hflip(image)
if target is not None:
width, _ = get_image_size(image)
target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]]
if "masks" in target:
target["masks"] = target["masks"].flip(-1)
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = _flip_coco_person_keypoints(keypoints, width)
target["keypoints"] = keypoints
return image, target
class ToTensor(nn.Module):
def forward(self, image: Tensor,
target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
image = F.to_tensor(image)
return image, target
class RandomIoUCrop(nn.Module):
def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5,
max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40):
super().__init__()
# Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
self.min_scale = min_scale
self.max_scale = max_scale
self.min_aspect_ratio = min_aspect_ratio
self.max_aspect_ratio = max_aspect_ratio
if sampler_options is None:
sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
self.options = sampler_options
self.trials = trials
def forward(self, image: Tensor,
target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if target is None:
raise ValueError("The targets can't be None for this transform.")
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
orig_w, orig_h = get_image_size(image)
while True:
# sample an option
idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
min_jaccard_overlap = self.options[idx]
if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option
return image, target
for _ in range(self.trials):
# check the aspect ratio limitations
r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)
new_w = int(orig_w * r[0])
new_h = int(orig_h * r[1])
aspect_ratio = new_w / new_h
if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
continue
# check for 0 area crops
r = torch.rand(2)
left = int((orig_w - new_w) * r[0])
top = int((orig_h - new_h) * r[1])
right = left + new_w
bottom = top + new_h
if left == right or top == bottom:
continue
# check for any valid boxes with centers within the crop area
cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2])
cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3])
is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)
if not is_within_crop_area.any():
continue
# check at least 1 box with jaccard limitations
boxes = target["boxes"][is_within_crop_area]
ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]],
dtype=boxes.dtype, device=boxes.device))
if ious.max() < min_jaccard_overlap:
continue
# keep only valid boxes and perform cropping
target["boxes"] = boxes
target["labels"] = target["labels"][is_within_crop_area]
target["boxes"][:, 0::2] -= left
target["boxes"][:, 1::2] -= top
target["boxes"][:, 0::2].clamp_(min=0, max=new_w)
target["boxes"][:, 1::2].clamp_(min=0, max=new_h)
image = F.crop(image, top, left, new_h, new_w)
return image, target
class RandomZoomOut(nn.Module):
def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5):
super().__init__()
if fill is None:
fill = [0., 0., 0.]
self.fill = fill
self.side_range = side_range
if side_range[0] < 1. or side_range[0] > side_range[1]:
raise ValueError("Invalid canvas side range provided {}.".format(side_range))
self.p = p
@torch.jit.unused
def _get_fill_value(self, is_pil):
# type: (bool) -> int
# We fake the type to make it work on JIT
return tuple(int(x) for x in self.fill) if is_pil else 0
def forward(self, image: Tensor,
target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
if torch.rand(1) < self.p:
return image, target
orig_w, orig_h = get_image_size(image)
r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
canvas_width = int(orig_w * r)
canvas_height = int(orig_h * r)
r = torch.rand(2)
left = int((canvas_width - orig_w) * r[0])
top = int((canvas_height - orig_h) * r[1])
right = canvas_width - (left + orig_w)
bottom = canvas_height - (top + orig_h)
if torch.jit.is_scripting():
fill = 0
else:
fill = self._get_fill_value(_is_pil_image(image))
image = F.pad(image, [left, top, right, bottom], fill=fill)
if isinstance(image, torch.Tensor):
v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1)
image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h):, :] = \
image[..., :, (left + orig_w):] = v
if target is not None:
target["boxes"][:, 0::2] += left
target["boxes"][:, 1::2] += top
return image, target
class RandomPhotometricDistort(nn.Module):
def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5),
hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875, 1.125), p: float = 0.5):
super().__init__()
self._brightness = T.ColorJitter(brightness=brightness)
self._contrast = T.ColorJitter(contrast=contrast)
self._hue = T.ColorJitter(hue=hue)
self._saturation = T.ColorJitter(saturation=saturation)
self.p = p
def forward(self, image: Tensor,
target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if isinstance(image, torch.Tensor):
if image.ndimension() not in {2, 3}:
raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
elif image.ndimension() == 2:
image = image.unsqueeze(0)
r = torch.rand(7)
if r[0] < self.p:
image = self._brightness(image)
contrast_before = r[1] < 0.5
if contrast_before:
if r[2] < self.p:
image = self._contrast(image)
if r[3] < self.p:
image = self._saturation(image)
if r[4] < self.p:
image = self._hue(image)
if not contrast_before:
if r[5] < self.p:
image = self._contrast(image)
if r[6] < self.p:
channels = get_image_num_channels(image)
permutation = torch.randperm(channels)
is_pil = _is_pil_image(image)
if is_pil:
image = F.to_tensor(image)
image = image[..., permutation, :, :]
if is_pil:
image = F.to_pil_image(image)
return image, target
source config_DGXA100_001x08x032.sh
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
#export NCCL_P2P_LEVEL=5
torchrun --nproc_per_node=8 train.py --lr 0.000085 --batch-size 18 --eval-batch-size 32 --epochs 1 --print-freq 20 --dataset-path /public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco
#torchrun --standalone --nproc_per_node=8 --no_python ./dcu_run.sh
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict, deque
import datetime
import errno
import os
import time
import torch
import torch.distributed as dist
class ScratchPad:
target_n = None
target_labels_padded = None
target_boxes_padded = None
target_matched_idxs = None
gt_classes_target = None
batch_size_vector = None
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self, group=None):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier(group=group)
dist.all_reduce(t, group=group)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
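# Usage sketch (illustrative only); the MetricLogger below keeps one SmoothedValue per metric:
#   loss_meter = SmoothedValue(window_size=20, fmt='{median:.4f} ({global_avg:.4f})')
#   loss_meter.update(0.73)
#   loss_meter.update(0.68)
#   print(str(loss_meter))   # median over the window and the global average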
def all_gather(data, group):
"""
Run all_gather on arbitrary picklable data (not necessarily tensors)
Args:
data: any picklable object
Returns:
list[data]: list of data gathered from each rank
"""
world_size = group.size() if group else get_world_size()
if world_size == 1:
return [data]
data_list = [None] * world_size
dist.all_gather_object(object_list=data_list, obj=data, group=group)
return data_list
def broadcast(data, src, group):
"""
Run broadcast on arbitrary picklable data (not necessarily tensors)
Args:
data: any picklable object
src: Source rank from which to broadcast data
Returns:
data: the object broadcast from the source rank (a list if the input was a list)
"""
world_size = group.size() if group else get_world_size()
if world_size == 1:
return data
data_list = data if isinstance(data, list) else [data]
dist.broadcast_object_list(object_list=data_list, src=src, group=group)
return data_list if isinstance(data, list) else data_list[0]
def reduce_dict(input_dict, group, average=True):
"""
Args:
input_dict (dict): all the values will be reduced
average (bool): whether to do average or sum
Reduce the values in the dictionary from all processes so that all processes
have the averaged results. Returns a dict with the same fields as
input_dict, after reduction.
"""
world_size = group.size() if group else get_world_size()
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.all_reduce(tensor=values, group=group)
if average:
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
class SimpleTimer(object):
def __init__(self, prefix=""):
self.prefix = prefix
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
run_time = self.end - self.start
print(f"{self.prefix}{run_time}")
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
self.summary = defaultdict(lambda: None)
self.current_iter = 0
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str)
def synchronize_between_processes(self, group=None):
for meter in self.meters.values():
meter.synchronize_between_processes(group=group)
def add_meter(self, name, meter):
self.meters[name] = meter
def log_every(self, iterable, print_freq, header=None):
self.current_iter = 0
self.summary['samples'] = 0
if not header:
header = ''
start_time = time.time()
self.summary['start_time'] = start_time
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
yield obj
iter_time.update(time.time() - end)
if self.current_iter % print_freq == 0 or self.current_iter == len(iterable) - 1:
eta_seconds = iter_time.global_avg * (len(iterable) - self.current_iter)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
self.current_iter, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
else:
print(log_msg.format(
self.current_iter, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
self.current_iter += 1
end = time.time()
self.summary['samples'] += len(obj[0])
self.summary['end_time'] = end
end_time = time.time()
total_time = end_time - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {} ({:.4f} s / it)'.format(
header, total_time_str, total_time / len(iterable)))
def collate_fn(batch):
return tuple(zip(*batch))
def warmup_lr_scheduler(optimizer, start_iter, warmup_iters, warmup_factor):
def f(x):
x = x + start_iter
if x >= warmup_iters:
return 1
alpha = float(x) / warmup_iters
return warmup_factor * (1 - alpha) + alpha
return torch.optim.lr_scheduler.LambdaLR(optimizer, f)
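# Usage sketch: with warmup_factor=1e-3 the LR multiplier ramps linearly from 1e-3 to 1.0 over
# warmup_iters steps and then stays at 1.0, e.g.
#   lr_scheduler = warmup_lr_scheduler(optimizer, start_iter=0, warmup_iters=1000, warmup_factor=1e-3)
#   optimizer.step(); lr_scheduler.step()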
def mkdir(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def save_on_master(*args, **kwargs):
if is_main_process():
torch.save(*args, **kwargs)
def barrier(group):
if not is_dist_avail_and_initialized():
return
torch.distributed.barrier(group)
def init_distributed_mode(args):
if args.world_size > 1 :
args.rank = args.local_rank
args.world_size = args.world_size
args.gpu = args.rank
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
else:
print('Not using distributed mode')
args.distributed = False
args.num_train_ranks = 1
args.num_eval_ranks = 1
args.rank = 0
args.ranks = 1
args.train_ranks = [0]
args.eval_ranks = [0]
args.train_rank = 0
args.eval_rank = 0
return None, None
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print(f'| distributed init (rank {args.rank}): {args.dist_url}')
if args.cuda_graphs or args.cuda_graphs_eval:
os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
args.ranks = list(range(args.world_size))
if args.num_eval_ranks is None:
args.num_train_ranks = args.world_size
args.num_eval_ranks = args.world_size
args.train_ranks = args.ranks
args.eval_ranks = args.ranks
args.train_rank = args.rank
args.eval_rank = args.rank
else:
args.num_train_ranks = args.world_size - args.num_eval_ranks
args.train_ranks = args.ranks[:args.num_train_ranks]
args.eval_ranks = args.ranks[args.num_train_ranks:]
args.train_rank = args.rank
args.eval_rank = args.rank - args.num_train_ranks
assert 1<=args.num_train_ranks<=args.world_size, "Number of training ranks must be between 1 and world size"
assert 1<=args.num_eval_ranks<=args.world_size, "Number of validation ranks must be between 1 and world size"
# create training and validation comm groups
train_group = torch.distributed.new_group(ranks=args.train_ranks)
eval_group = torch.distributed.new_group(ranks=args.eval_ranks)
setup_for_distributed(args.train_rank==0 or args.eval_rank==0)
# init new comms
tmp_tensor = torch.ones([1], device='cuda')
torch.distributed.all_reduce(tmp_tensor, group=train_group)
torch.distributed.all_reduce(tmp_tensor, group=eval_group)
return train_group, eval_group
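# Example of the rank split implemented above: with --world-size 8 and --num-eval-ranks 2,
# ranks 0-5 form the training group and ranks 6-7 form the evaluation group (eval_rank is then
# rank - 6). When --num-eval-ranks is not given, every rank belongs to both groups.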