# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
import time
import torch
import torch.distributed as dist
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import (EPOCH_START, EPOCH_STOP, EVAL_START, EVAL_STOP, EVAL_ACCURACY)
import utils
from mlperf_common.scaleoutbridge import ScaleoutBridgeBase as SBridge
from async_executor import async_executor
def preprocessing(images, targets, model_ptr, data_layout):
# TODO: can be parallelized? should we use DALI? there must be a better way
target_per_image = []
for i in range(len(images)):
# create List[Dict] mapping for targets, used only for preprocessing.
# only 'boxes', and perhaps 'keypoints', are used for preprocessing.
dict_ = {'boxes': targets['boxes'][i]}
assert ('keypoints' not in targets)
target_per_image.append(dict_)
images, targets_ = model_ptr.transform(images, target_per_image)
# List[Dict] -> Dict[List]
for i in range(len(targets_)):
targets['boxes'][i] = targets_[i]['boxes']
images = images.tensors
if data_layout == 'channels_last':
images = images.to(memory_format=torch.channels_last)
return images, targets
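# Illustrative shapes after preprocessing (assuming the fixed 800x800 training resolution
# used elsewhere in this file; the exact padded size comes from model_ptr.transform):
#   images:  float tensor [N, 3, 800, 800], channels_last if data_layout requests it
#   targets: Dict[List], e.g. targets['boxes'] is a list of N tensors of shape [num_boxes_i, 4]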
def init_scratchpad(images, targets, batch_size, num_classes, amp, fused_focal_loss,
max_boxes, cls_head_padded, reg_head_padded, cuda_graphs):
device = targets['labels'][0].device
# TODO: should we skip validation in deployment?
# model_ptr.validate_input(images, targets)
# one-time init
if utils.ScratchPad.target_labels_padded is None and cls_head_padded:
utils.ScratchPad.target_labels_padded = torch.zeros([batch_size, max_boxes + 1],
device=device, dtype=torch.int64)
utils.ScratchPad.target_labels_padded[:, -1] = num_classes if not fused_focal_loss else -1
if utils.ScratchPad.target_boxes_padded is None and reg_head_padded:
utils.ScratchPad.target_boxes_padded = torch.zeros([batch_size, max_boxes, 4], device=device)
if utils.ScratchPad.target_n is None and (cls_head_padded or reg_head_padded):
utils.ScratchPad.target_n = torch.zeros([batch_size, 1], device=device, dtype=torch.int64)
if utils.ScratchPad.target_matched_idxs is None and cuda_graphs:
utils.ScratchPad.target_matched_idxs = torch.zeros_like(targets['matched_idxs'], device=device)
# these allocations are used to avoid allocations per iteration
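# The constant 120087 below is the per-image anchor count: an 800x800 input yields feature
# maps of 100x100, 50x50, 25x25, 13x13 and 7x7 (13343 locations), and with 9 anchors per
# location (3 scales x 3 aspect ratios, the usual RetinaNet setup) 13343 * 9 = 120087.
# See AnchorGenerator.forward_opt for the matching grid sizes.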
if utils.ScratchPad.gt_classes_target is None:
if not fused_focal_loss:
utils.ScratchPad.gt_classes_target = \
torch.zeros(torch.Size([batch_size, 120087, num_classes + (1 if cls_head_padded else 0)]),
dtype=torch.float32 if not amp else torch.float16).to(device)
else:
utils.ScratchPad.gt_classes_target = \
torch.zeros(torch.Size([batch_size, 120087]), device=device, dtype=torch.int64)
if utils.ScratchPad.batch_size_vector is None:
utils.ScratchPad.batch_size_vector = torch.arange(len(targets['boxes'])).unsqueeze(1).cuda()
# data init
if cls_head_padded:
utils.ScratchPad.target_labels_padded[:, :-1].fill_(0)
if reg_head_padded:
utils.ScratchPad.target_boxes_padded.fill_(0)
if cuda_graphs:
utils.ScratchPad.target_matched_idxs.copy_(targets['matched_idxs'])
for i in range(images.size(0)):
# debug
# assert targets['labels'][i].size(0) < max_boxes
labels_n = targets['labels'][i].size(0)
if cls_head_padded:
utils.ScratchPad.target_labels_padded[i][:labels_n] = targets['labels'][i][:labels_n]
# debug: if args.apex_focal_loss, then the -1 position remains num_classes and is not overridden
# assert ((not fused_focal_loss and (utils.ScratchPad.target_labels_padded[:, -1] == num_classes).all())
# or fused_focal_loss)
if reg_head_padded:
utils.ScratchPad.target_boxes_padded[i][:labels_n] = targets['boxes'][i][:labels_n]
if cls_head_padded or reg_head_padded:
utils.ScratchPad.target_n[i] = labels_n
utils.ScratchPad.gt_classes_target.fill_(0 if not fused_focal_loss else -1)
def compute_matched_idxs(targets_boxes, model_ptr):
matched_idxs = model_ptr.get_matched_idxs(targets_boxes)
return matched_idxs
def loss_preprocessing(targets_boxes, targets_labels, matched_idxs, model_ptr, fused_focal_loss, max_boxes,
cls_head_padded, reg_head_padded):
# classification loss prologues
if cls_head_padded:
gt_classes_target, num_foreground, valid_idxs = \
model_ptr.head.classification_head.compute_loss_prologue_padded(targets_labels,
matched_idxs,
one_hot=(not fused_focal_loss),
max_boxes=max_boxes)
else:
gt_classes_target, num_foreground, valid_idxs = \
model_ptr.head.classification_head.compute_loss_prologue(targets_labels, matched_idxs,
one_hot=(not fused_focal_loss))
# regression loss prologues
if reg_head_padded:
target_regression, _, foreground_idxs_mask = \
model_ptr.head.regression_head.compute_loss_prologue_padded(targets_boxes, matched_idxs, model_ptr.anchors)
else:
target_regression, _, foreground_idxs_mask = \
model_ptr.head.regression_head.compute_loss_prologue(targets_boxes, matched_idxs, model_ptr.anchors)
return gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask
def compute_loss(model_ptr, cls_logits, bbox_regression, valid_idxs, gt_classes_target, num_foreground,
target_regression, foreground_idxs_mask, fused_focal_loss, reg_head_padded):
cls_loss = model_ptr.head.classification_head.compute_loss_core(cls_logits, gt_classes_target,
valid_idxs, num_foreground,
fused_focal_loss=fused_focal_loss)
if reg_head_padded:
reg_loss = model_ptr.head.regression_head.compute_loss_core_padded(bbox_regression, target_regression,
foreground_idxs_mask, num_foreground)
else:
reg_loss = model_ptr.head.regression_head.compute_loss_core(bbox_regression, target_regression,
foreground_idxs_mask, num_foreground)
return cls_loss, reg_loss
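# The two head losses are summed into the total training loss in train_one_epoch:
#   total_loss = cls_loss + reg_loss
# where cls_loss is the (focal) classification loss over valid anchors and reg_loss is the
# box-regression loss over foreground anchors; the normalization by num_foreground (the
# usual RetinaNet convention) happens inside the heads' compute_loss_core* implementations.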
def train_one_epoch(model, optimizer, scaler, data_loader, device, epoch, train_group, args,
graphed_model=None, static_input=None, static_loss=None, static_prologues_out=None,
sbridge=SBridge()):
mllogger.start(key=EPOCH_START, value=epoch, metadata={"epoch_num": epoch}, sync=True, sync_group=train_group)
sbridge.start_epoch_prof()
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
if not args.skip_metric_loss:
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
# direct pointer to the model
model_ptr = model.module if args.distributed else model
lr_scheduler = None
if epoch < args.warmup_epochs:
# Convert epochs to iterations
# we want to control warmup at the epoch level, but update lr every iteration
start_iter = epoch*len(data_loader)
warmup_iters = args.warmup_epochs*len(data_loader)
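# Sketch of the warmup behavior this helper is expected to provide (the actual logic lives
# in utils.warmup_lr_scheduler): the lr factor ramps linearly from warmup_factor to 1.0,
#   factor(it) = warmup_factor * (1 - alpha) + alpha, with alpha = (start_iter + it) / warmup_iters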
lr_scheduler = utils.warmup_lr_scheduler(optimizer, start_iter, warmup_iters, args.warmup_factor)
accuracy = None
for images, targets in metric_logger.log_every(data_loader, args.print_freq, header):
sbridge.start_prof(SBridge.ITER_TIME)
if args.syn_dataset:
images = list(image.to(device, non_blocking=True) for image in images)
images = torch.stack(images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
targets['matched_idxs'] = torch.stack(targets['matched_idxs'])
else:
# DALI iterator provides data as needed
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
# arrange "targets" as a Dict[List], instead of a List[Dict], so later it will be easier to use targets
# data in parallel (e.g., to get the entire batch "boxes", one can just use targets['boxes']).
# TODO: there might be some unused fields in the targets tensor, so perhaps can avoid some transfers
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# DALI can compute matched_idxs and put it in targets, but if it doesn't do so, do it here
if 'matched_idxs' not in targets:
with torch.cuda.amp.autocast(enabled=args.amp):
targets['matched_idxs'] = compute_matched_idxs(targets['boxes'], model_ptr)
if not args.cuda_graphs:
optimizer.zero_grad()
# init necessary data in the scratchpad
with torch.cuda.amp.autocast(enabled=args.amp):
init_scratchpad(images, targets, args.batch_size, args.num_classes, args.amp,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad,
args.cuda_graphs)
if lr_scheduler is not None:
lr_scheduler.step()
if args.cuda_graphs:
if args.not_graphed_prologues:
with torch.cuda.amp.autocast(enabled=args.amp):
# loss prologue: preprocess everything that does not require model forward and backward
# use the padded scratchpad buffers if reg_head_pad/cls_head_pad are toggled
targets_boxes = targets['boxes'] if not args.reg_head_pad else utils.ScratchPad.target_boxes_padded
targets_labels = targets['labels'] if not args.cls_head_pad else utils.ScratchPad.target_labels_padded
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(targets_boxes, targets_labels, targets['matched_idxs'], model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
static_input.copy_(images)
# All necessary data is copied to the graph buffers in init_scratchpad
# The graph is programmed to use pointers to the scratchpad (besides images)
if args.not_graphed_prologues:
static_prologues_out[0].copy_(gt_classes_target)
static_prologues_out[1].copy_(target_regression)
static_prologues_out[2].copy_(num_foreground)
static_prologues_out[3].copy_(valid_idxs)
static_prologues_out[4].copy_(foreground_idxs_mask)
# graphed model comprises loss_preprocessing->forward->compute_loss->backward
graphed_model.replay()
if not args.skip_metric_loss:
dist.all_reduce(tensor=static_loss, group=train_group)
losses_reduced = static_loss / utils.get_world_size()
if args.sync_after_graph_replay:
torch.cuda.synchronize()
sbridge.start_prof(SBridge.OPT_TIME)
scaler.step(optimizer)
scaler.update()
sbridge.stop_prof(SBridge.OPT_TIME)
else:
with torch.cuda.amp.autocast(enabled=args.amp):
# loss prologue: preprocess everything that does not require model forward and backward
# use the padded scratchpad buffers if reg_head_pad/cls_head_pad are toggled
targets_boxes = utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes']
targets_labels = utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels']
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(targets_boxes, targets_labels, targets['matched_idxs'], model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
sbridge.start_prof(SBridge.FWD_TIME)
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
# loss (given the prologue computations)
cls_loss, reg_loss = compute_loss(model_ptr, model_output[5], model_output[6], valid_idxs,
gt_classes_target, num_foreground, target_regression,
foreground_idxs_mask, args.apex_focal_loss, args.reg_head_pad)
loss_dict = {'classification': cls_loss, 'bbox_regression': reg_loss}
losses = sum(loss for loss in loss_dict.values())
# --- old loss (for debug)
# loss_dict_ = model_ptr.compute_loss(targets, head_outputs)
# assert(torch.allclose(loss_dict['classification'], loss_dict_['classification']))
# assert(torch.allclose(loss_dict['bbox_regression'], loss_dict_['bbox_regression']))
# reduce losses over all GPUs for logging purposes
# TODO: remove
loss_dict_reduced = utils.reduce_dict(loss_dict, group=train_group)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
sbridge.stop_prof(SBridge.FWD_TIME)
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
# backward
sbridge.start_prof(SBridge.BWD_TIME)
scaler.scale(losses).backward()
sbridge.stop_start_prof(SBridge.BWD_TIME, SBridge.OPT_TIME)
scaler.step(optimizer)
scaler.update()
sbridge.stop_prof(SBridge.OPT_TIME)
if not args.skip_metric_loss:
if not args.cuda_graphs:
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
else:
metric_logger.update(loss=losses_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
# Check async coco result
if args.async_coco and not (metric_logger.current_iter % args.async_coco_check_freq):
# FIXME(ahmadki): --num-eval-ranks
if args.eval_rank == 0:
results = async_executor.pop_if_done()
# in case multiple results are returned, keep the highest mAP
if results and len(results) > 0:
accuracy = max([result['bbox'][0] for result in results.values() if result], default=-1)
if args.distributed:
accuracy = utils.broadcast(accuracy, 0, group=train_group)
if args.target_map and accuracy and accuracy >= args.target_map:
break
sbridge.stop_prof(SBridge.ITER_TIME)
sbridge.stop_epoch_prof()
mllogger.end(key=EPOCH_STOP, value=epoch, metadata={"epoch_num": epoch}, sync=True, sync_group=train_group)
summary = metric_logger.summary
if summary['samples'] > 0:
throughput = summary['samples'] / (summary['end_time'] - summary['start_time'])
mllogger.event(key='tracked_stats', value={'throughput': throughput}, metadata={'step': (epoch + 1)})
return metric_logger, accuracy
@torch.no_grad()
def evaluate(model, data_loader, device, epoch, eval_group, args,
graphed_model=None, static_input=None, static_output=None, sbridge=SBridge()):
sbridge.start_eval_prof()
mllogger.start(key=EVAL_START, value=epoch, metadata={"epoch_num": epoch},
sync=True, sync_group=eval_group)
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
iou_types = ["bbox"]
if args.cocoeval == 'python':
from coco.python_coco import PythonCocoEvaluator
coco_evaluator = PythonCocoEvaluator(annotations_file=args.val_annotations_file,
iou_types=iou_types, group=eval_group)
elif args.cocoeval == 'nvidia':
from coco.nvidia_coco import NVCocoEvaluator
coco_evaluator = NVCocoEvaluator(annotations_file=args.val_annotations_file,
iou_types=iou_types, num_threads=args.coco_threads, group=eval_group)
else:
assert False, f"Unknown coco evaluator implementation: {args.cocoeval}"
model_ptr = model.module if args.distributed else model
for images, targets in metric_logger.log_every(data_loader, args.eval_print_freq, header):
if args.dali_eval and images.shape[0] == 0:
continue
if not args.dali_eval:
images = list(img.to(device, non_blocking=True) for img in images)
if torch.cuda.is_available():
torch.cuda.synchronize()
# preprocessing
for i, (image, target) in enumerate(zip(images, targets)):
# add the original image size to targets
targets[i]['original_image_size'] = image.shape[-2:]
images, targets = model_ptr.transform(images, targets)
images = images.tensors
if args.data_layout == 'channels_last':
images = images.to(memory_format=torch.channels_last)
model_time = time.time()
if not args.cuda_graphs_eval:
with torch.cuda.amp.autocast(enabled=args.amp):
model_output = model(images)
else:
if images.size(0) != args.eval_batch_size:
static_input.fill_(0)
static_input[:images.size(0)].copy_(images)
graphed_model.replay()
model_output = static_output
if images.size(0) != args.eval_batch_size:
model_output = [x[:images.size(0)] for x in model_output]
with torch.cuda.amp.autocast(enabled=args.amp):
features = model_output[0:5]
head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
outputs = model_ptr.eval_postprocess(images, features, targets, head_outputs, targets_dict=args.dali_eval)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
if args.dali_eval:
res = {target.item(): output for target, output in zip(targets['image_id'], outputs)}
else:
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes(group=eval_group)
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
def log_callback(future):
stats = future.result()
accuracy = stats['bbox'][0]
mllogger.event(key=EVAL_ACCURACY, value=accuracy, metadata={"epoch_num": epoch})
mllogger.end(key=EVAL_STOP, value=epoch, metadata={"epoch_num": epoch})
accuracy = None
if (not args.distributed) or args.eval_rank == 0:
if args.async_coco:
async_executor.submit(tag=str(epoch),
fn=coco_evaluator.get_stats_from_evaluator,
evaluator=coco_evaluator)
async_executor.add_done_callback(tag=str(epoch), fn=log_callback)
else:
stats = coco_evaluator.get_stats_from_evaluator(coco_evaluator)
accuracy = stats['bbox'][0]
mllogger.event(key=EVAL_ACCURACY, value=accuracy, metadata={"epoch_num": epoch})
mllogger.end(key=EVAL_STOP, value=epoch, metadata={"epoch_num": epoch})
if (not args.async_coco) and args.distributed:
accuracy = utils.broadcast(accuracy, 0, group=eval_group)
torch.set_num_threads(n_threads)
sbridge.stop_eval_prof()
return accuracy
#!/usr/bin/gawk -f
# Requires GNU awk (gawk): the script uses BEGINFILE/ENDFILE and the three-argument match().
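# Usage (illustrative; the file name is whatever this script is saved as):
#   gawk -f parse_mll.awk --header run_0.log run_1.log > summary.tsv
# Pass --header as the first argument to print the column header once; one TSV row is then
# emitted per log file.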
BEGIN {
OFS="\t"
if (ARGV[1] == "--header") {
print_header()
ARGV[1]="" # tell awk that '--header' isn't a file name
}
}
function print_header() {
print("file", "gpus", "batch", "total", "converge", "init", "epoch1", "epoch_avg", "train_time", "last_eval", "[epoch", "eval_time", "mAP]*")
}
function get_mll_string_val(line, key) {
myregex= "\"" key "\": \"([^\"]*)\""
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_int_val(line, key) {
myregex= "\"" key "\": ([0-9]*)"
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_float_val(line, key) {
myregex= "\"" key "\": ([0-9.e+-]*)"
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_time(line) {
return get_mll_int_val(line, "time_ms")/1000
}
function get_mll_epoch_num(line) {
return get_mll_int_val(line, "epoch_num")
}
BEGINFILE {
stop_status = "notdone"
last_eval_epoch = -1
run_start_time = 0
run_stop_time = 0
avg_epoch_time = -1
init_time = -1
training_time = 0
last_eval_time = 0
delete eval_time
delete eval_acc
delete time_at_epoch
ranks = 0
global_batch = -1
}
# make sure all the relevant lines have the fields in the positions we expect
# them (sometimes the parallel output causes multiple (or none) "0: " at the
# beginning instead of the single one expected, just make it none)
/:::MLL/ {
sub(/^.*:::MLL/, ":::MLL")
}
/:::MLL.*"key": "init_start"/ {
ranks = ranks+1
}
/:::MLL.*"key": "global_batch_size"/ {
global_batch = get_mll_int_val($0, "value")
}
/:::MLL.*"key": "epoch_start"/ {
epoch_num = get_mll_epoch_num($0)
epoch_start_time = get_mll_time($0)
time_at_epoch[epoch_num] = epoch_start_time
if (epoch_num == 1) {
init_time=epoch_start_time-run_start_time
}
}
/:::MLL.*"key": "epoch_stop"/ {
epoch_num = get_mll_epoch_num($0)
current_time = get_mll_time($0)
training_time = training_time + (current_time-time_at_epoch[epoch_num])
if (epoch_num > 1 && epoch_num <= 39) {
avg_epoch_time = (current_time - time_at_epoch[2])/(epoch_num-1)
}
}
/:::MLL.*"key": "eval_start"/ {
epoch_num = get_mll_epoch_num($0)
current_time = get_mll_time($0)
eval_time[epoch_num] = current_time
last_eval_time = current_time
}
/:::MLL.*"key": "eval_accuracy"/ {
eval_acc[get_mll_epoch_num($0)] = get_mll_float_val($0, "value")
}
/:::MLL.*"key": "eval_stop"/ {
epoch_num = get_mll_epoch_num($0)
eval_time[epoch_num] = get_mll_time($0) - eval_time[epoch_num]
last_eval_epoch = epoch_num
}
/:::MLL.*"key": "run_start"/ {
run_start_time=get_mll_time($0)
}
function printall(fname, total_time, last_eval_epoch, init_time, avg_epoch_time, eval_time, eval_acc) {
if (ranks > 0) {
local_batch=global_batch/ranks
} else {
local_batch = -1
}
printf("%s\t%d\t%d\t%.2f\t%s\t%.4f\t%.4f\t%.4f\t%.2f\t%.2f", fname, ranks, local_batch, total_time, last_eval_epoch, init_time, time_at_epoch[2]-time_at_epoch[1], avg_epoch_time, training_time, last_eval_time)
for (i in eval_time) {
printf("\t%d\t%.4f\t%.4f", i, eval_time[i], eval_acc[i])
}
printf("\n")
}
/:::MLL.*"key": "run_stop"/ {
stop_status = get_mll_string_val($0, "status")
run_stop_time = get_mll_time($0)
if (last_eval_time > 0) {
last_eval_time = run_stop_time-last_eval_time
}
if (stop_status == "success") {
stop_status = last_eval_epoch
}
}
ENDFILE {
printall(FILENAME, run_stop_time-run_start_time, stop_status, init_time, avg_epoch_time, eval_time, eval_acc)
}
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mlperf_common.logging import MLLoggerWrapper
from mlperf_common.frameworks.pyt import PyTCommunicationHandler
mllogger = MLLoggerWrapper(PyTCommunicationHandler(), value=None)
import warnings
from typing import Callable, List, Optional, Sequence, Tuple, Union
import torch
from torch import Tensor
from ..utils import _log_api_usage_once, _make_ntuple
interpolate = torch.nn.functional.interpolate
class FrozenBatchNorm2d(torch.nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed
Args:
num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)``
eps (float): a value added to the denominator for numerical stability. Default: 1e-5
"""
def __init__(
self,
num_features: int,
eps: float = 1e-5,
):
super().__init__()
_log_api_usage_once(self)
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features))
def _load_from_state_dict(
self,
state_dict: dict,
prefix: str,
local_metadata: dict,
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x: Tensor) -> Tensor:
# move reshapes to the beginning
# to make it fuser-friendly
w = self.weight.reshape(1, -1, 1, 1)
b = self.bias.reshape(1, -1, 1, 1)
rv = self.running_var.reshape(1, -1, 1, 1)
rm = self.running_mean.reshape(1, -1, 1, 1)
scale = w * (rv + self.eps).rsqrt()
bias = b - rm * scale
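# Algebraically this is plain inference-mode batch norm,
#   (x - running_mean) / sqrt(running_var + eps) * weight + bias,
# refactored into a single per-channel scale and shift so it lowers to one fused multiply-add.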
return x * scale + bias
def __repr__(self) -> str:
return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
class ConvNormActivation(torch.nn.Sequential):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, ...]] = 3,
stride: Union[int, Tuple[int, ...]] = 1,
padding: Optional[Union[int, Tuple[int, ...], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, ...]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d,
) -> None:
if padding is None:
if isinstance(kernel_size, int) and isinstance(dilation, int):
padding = (kernel_size - 1) // 2 * dilation
else:
_conv_dim = len(kernel_size) if isinstance(kernel_size, Sequence) else len(dilation)
kernel_size = _make_ntuple(kernel_size, _conv_dim)
dilation = _make_ntuple(dilation, _conv_dim)
padding = tuple((kernel_size[i] - 1) // 2 * dilation[i] for i in range(_conv_dim))
if bias is None:
bias = norm_layer is None
layers = [
conv_layer(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation=dilation,
groups=groups,
bias=bias,
)
]
if norm_layer is not None:
layers.append(norm_layer(out_channels))
if activation_layer is not None:
params = {} if inplace is None else {"inplace": inplace}
layers.append(activation_layer(**params))
super().__init__(*layers)
_log_api_usage_once(self)
self.out_channels = out_channels
if self.__class__ == ConvNormActivation:
warnings.warn(
"Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead."
)
class Conv2dNormActivation(ConvNormActivation):
"""
Configurable block used for Convolution2d-Normalization-Activation blocks.
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]] = 3,
stride: Union[int, Tuple[int, int]] = 1,
padding: Optional[Union[int, Tuple[int, int], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, int]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
) -> None:
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups,
norm_layer,
activation_layer,
dilation,
inplace,
bias,
torch.nn.Conv2d,
)
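# Example (illustrative): a Conv2d -> BatchNorm2d -> ReLU block with "same"-style padding
#   block = Conv2dNormActivation(64, 128, kernel_size=3, stride=2)
#   out = block(torch.randn(1, 64, 56, 56))   # -> shape (1, 128, 28, 28)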
class Conv3dNormActivation(ConvNormActivation):
"""
Configurable block used for Convolution3d-Normalization-Activation blocks.
Args:
in_channels (int): Number of channels in the input video.
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int, int]] = 3,
stride: Union[int, Tuple[int, int, int]] = 1,
padding: Optional[Union[int, Tuple[int, int, int], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm3d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, int, int]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
) -> None:
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups,
norm_layer,
activation_layer,
dilation,
inplace,
bias,
torch.nn.Conv3d,
)
class SqueezeExcitation(torch.nn.Module):
"""
This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
Args:
input_channels (int): Number of channels in the input image
squeeze_channels (int): Number of squeeze channels
activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU``
scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid``
"""
def __init__(
self,
input_channels: int,
squeeze_channels: int,
activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
) -> None:
super().__init__()
_log_api_usage_once(self)
self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)
self.activation = activation()
self.scale_activation = scale_activation()
def _scale(self, input: Tensor) -> Tensor:
scale = self.avgpool(input)
scale = self.fc1(scale)
scale = self.activation(scale)
scale = self.fc2(scale)
return self.scale_activation(scale)
def forward(self, input: Tensor) -> Tensor:
scale = self._scale(input)
return scale * input
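# Example (illustrative): channel-wise reweighting with a 4x squeeze
#   se = SqueezeExcitation(input_channels=64, squeeze_channels=16)
#   y = se(torch.randn(2, 64, 32, 32))   # same shape as the input, each channel scaled by a factor in (0, 1)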
class MLP(torch.nn.Sequential):
"""This block implements the multi-layer perceptron (MLP) module.
Args:
in_channels (int): Number of channels of the input
hidden_channels (List[int]): List of the hidden channel dimensions
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer won't be used. Default: ``None``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool): Whether to use bias in the linear layer. Default ``True``
dropout (float): The probability for the dropout layer. Default: 0.0
"""
def __init__(
self,
in_channels: int,
hidden_channels: List[int],
norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
inplace: Optional[bool] = True,
bias: bool = True,
dropout: float = 0.0,
):
# The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
# https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
params = {} if inplace is None else {"inplace": inplace}
layers = []
in_dim = in_channels
for hidden_dim in hidden_channels[:-1]:
layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
if norm_layer is not None:
layers.append(norm_layer(hidden_dim))
layers.append(activation_layer(**params))
layers.append(torch.nn.Dropout(dropout, **params))
in_dim = hidden_dim
layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
layers.append(torch.nn.Dropout(dropout, **params))
super().__init__(*layers)
_log_api_usage_once(self)
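# Example (illustrative): a two-layer MLP 128 -> 256 -> 10 with ReLU and dropout
#   mlp = MLP(128, [256, 10], dropout=0.1)
#   out = mlp(torch.randn(4, 128))   # -> shape (4, 10)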
class Permute(torch.nn.Module):
"""This module returns a view of the tensor input with its dimensions permuted.
Args:
dims (List[int]): The desired ordering of dimensions
"""
def __init__(self, dims: List[int]):
super().__init__()
self.dims = dims
def forward(self, x: Tensor) -> Tensor:
return torch.permute(x, self.dims)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from torch import nn, Tensor
from typing import List, Optional
from model.image_list import ImageList
class AnchorGenerator(nn.Module):
"""
Module that generates anchors for a set of feature maps and
image sizes.
The module supports computing anchors at multiple sizes and aspect ratios
per feature map. This module assumes aspect ratio = height / width for
each anchor.
sizes and aspect_ratios should have the same number of elements, which should
correspond to the number of feature maps.
sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
per spatial location for feature map i.
Args:
sizes (Tuple[Tuple[int]]):
aspect_ratios (Tuple[Tuple[float]]):
"""
__annotations__ = {
"cell_anchors": List[torch.Tensor],
}
def __init__(
self,
sizes=((128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),),
):
super(AnchorGenerator, self).__init__()
if not isinstance(sizes[0], (list, tuple)):
# TODO change this
sizes = tuple((s,) for s in sizes)
if not isinstance(aspect_ratios[0], (list, tuple)):
aspect_ratios = (aspect_ratios,) * len(sizes)
assert len(sizes) == len(aspect_ratios)
self.sizes = sizes
self.aspect_ratios = aspect_ratios
self.cell_anchors = [self.generate_anchors(size, aspect_ratio)
for size, aspect_ratio in zip(sizes, aspect_ratios)]
# TODO: https://github.com/pytorch/pytorch/issues/26792
# For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values.
# (scales, aspect_ratios) are usually an element of zip(self.scales, self.aspect_ratios)
# This method assumes aspect ratio = height / width for an anchor.
def generate_anchors(self, scales: List[int], aspect_ratios: List[float], dtype: torch.dtype = torch.float32,
device: torch.device = torch.device("cpu")):
scales = torch.as_tensor(scales, dtype=dtype, device=device)
aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
h_ratios = torch.sqrt(aspect_ratios)
w_ratios = 1 / h_ratios
ws = (w_ratios[:, None] * scales[None, :]).view(-1)
hs = (h_ratios[:, None] * scales[None, :]).view(-1)
base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
return base_anchors.round()
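# Worked example (illustrative): scales=(32,), aspect_ratios=(0.5, 1.0, 2.0) gives
#   ws ~= [45.3, 32.0, 22.6] and hs ~= [22.6, 32.0, 45.3], so the rounded zero-centered
#   base anchors ([x1, y1, x2, y2]) are
#   [[-23, -11, 23, 11], [-16, -16, 16, 16], [-11, -23, 11, 23]].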
def set_cell_anchors(self, dtype: torch.dtype, device: torch.device):
self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device)
for cell_anchor in self.cell_anchors]
def num_anchors_per_location(self):
return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
# For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
# output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
def grid_anchors(self, grid_sizes: List[List[int]], strides: List[List[Tensor]]) -> List[Tensor]:
anchors = []
cell_anchors = self.cell_anchors
assert cell_anchors is not None
if not (len(grid_sizes) == len(strides) == len(cell_anchors)):
raise ValueError("Anchors should be Tuple[Tuple[int]] because each feature "
"map could potentially have different sizes and aspect ratios. "
"There needs to be a match between the number of "
"feature maps passed and the number of sizes / aspect ratios specified.")
for size, stride, base_anchors in zip(
grid_sizes, strides, cell_anchors
):
grid_height, grid_width = size
stride_height, stride_width = stride
device = base_anchors.device
# For output anchor, compute [x_center, y_center, x_center, y_center]
shifts_x = torch.arange(
0, grid_width, dtype=torch.float32, device=device
) * stride_width
shifts_y = torch.arange(
0, grid_height, dtype=torch.float32, device=device
) * stride_height
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
# For every (base anchor, output anchor) pair,
# offset each zero-centered base anchor by the center of the output anchor.
anchors.append(
(shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
)
return anchors
def forward(self, image_list: Tensor, feature_maps: List[Tensor]) -> List[Tensor]:
grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
image_size = image_list.shape[-2:]
dtype, device = feature_maps[0].dtype, feature_maps[0].device
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
self.set_cell_anchors(dtype, device)
anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
anchors: List[List[torch.Tensor]] = []
for _ in range(image_list.size(0)):
anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps]
anchors.append(anchors_in_image)
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
return anchors
def forward_opt(self, image_shape: torch.Size = None, grid_sizes: List[torch.Size] = None,
device: torch.device = None, dtype=torch.float16):
assert(device is not None)
image_size = torch.Size([800, 800]) if image_shape is None else image_shape[-2:]
grid_sizes = [torch.Size([100, 100]), torch.Size([50, 50]),
torch.Size([25, 25]), torch.Size([13, 13]),
torch.Size([7, 7])] if grid_sizes is None else grid_sizes
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
self.set_cell_anchors(dtype, device)
anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
anchors: List[List[torch.Tensor]] = []
for _ in range(image_shape[0]):
anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps]
anchors.append(anchors_in_image)
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
return anchors
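# Note: with the default forward_opt grid sizes (100^2 + 50^2 + 25^2 + 13^2 + 7^2 = 13343
# locations) and 9 anchors per location (3 scales x 3 aspect ratios, the usual RetinaNet
# setup), each image gets 13343 * 9 = 120087 anchors, matching the scratchpad allocation
# in the training loop.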