# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from torch import nn
from torchvision.ops import misc as misc_nn_ops
import model.resnet
from model.utils import IntermediateLayerGetter
from model.feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
class BackboneWithFPN(nn.Module):
"""
Adds a FPN on top of a model.
Internally, it uses model.utils.IntermediateLayerGetter to
extract a submodel that returns the feature maps specified in return_layers.
The same limitations of IntermediateLayerGetter apply here.
Args:
backbone (nn.Module)
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
the key of the dict, and the value of the dict is the name
of the returned activation (which the user can specify).
in_channels_list (List[int]): number of channels for each feature map
that is returned, in the order they are present in the OrderedDict
out_channels (int): number of channels in the FPN.
Attributes:
out_channels (int): the number of channels in the FPN
"""
def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None, module_name=""):
super(BackboneWithFPN, self).__init__()
if extra_blocks is None:
extra_blocks = LastLevelMaxPool()
self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
self.fpn = FeaturePyramidNetwork(
in_channels_list=in_channels_list,
out_channels=out_channels,
extra_blocks=extra_blocks,
module_name="module.backbone.fpn",
)
self.out_channels = out_channels
def forward(self, x):
x = self.body(x)
x = self.fpn(x)
return x
def resnet_fpn_backbone(
backbone_name,
pretrained,
norm_layer=misc_nn_ops.FrozenBatchNorm2d,
trainable_layers=3,
returned_layers=None,
extra_blocks=None,
**kwargs
):
"""
Constructs a specified ResNet backbone with FPN on top. Freezes the specified number of layers in the backbone.
Examples::
>>> from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
>>> backbone = resnet_fpn_backbone('resnet50', pretrained=True, trainable_layers=3)
>>> # get some dummy image
>>> x = torch.rand(1,3,64,64)
>>> # compute the output
>>> output = backbone(x)
>>> print([(k, v.shape) for k, v in output.items()])
>>> # returns
>>> [('0', torch.Size([1, 256, 16, 16])),
>>> ('1', torch.Size([1, 256, 8, 8])),
>>> ('2', torch.Size([1, 256, 4, 4])),
>>> ('3', torch.Size([1, 256, 2, 2])),
>>> ('pool', torch.Size([1, 256, 1, 1]))]
Args:
backbone_name (string): resnet architecture. Possible values are 'resnet50',
'resnet101', 'resnext50_32x4d', 'resnext101_32x8d'
pretrained (bool): If True, returns a model with backbone pre-trained on ImageNet
norm_layer (torchvision.ops): it is recommended to use the default value. For details visit:
(https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
trainable_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
returned_layers (list of int): The layers of the network to return. Each entry must be in ``[1, 4]``.
By default all layers are returned.
extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
be performed. It is expected to take the fpn features, the original
features and the names of the original features as input, and returns
a new list of feature maps and their corresponding names. By
default a ``LastLevelMaxPool`` is used.
"""
backbone = model.resnet.__dict__[backbone_name](pretrained=pretrained, norm_layer=norm_layer, **kwargs)
# select layers that won't be frozen
assert 0 <= trainable_layers <= 5
layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
if trainable_layers == 5:
layers_to_train.append("bn1")
for name, parameter in backbone.named_parameters():
if all([not name.startswith(layer) for layer in layers_to_train]):
parameter.requires_grad_(False)
if extra_blocks is None:
extra_blocks = LastLevelMaxPool()
if returned_layers is None:
returned_layers = [1, 2, 3, 4]
assert min(returned_layers) > 0 and max(returned_layers) < 5
return_layers = {f"layer{k}": str(v) for v, k in enumerate(returned_layers)}
in_channels_stage2 = backbone.inplanes // 8
in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
out_channels = 256
return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
def _validate_trainable_layers(pretrained, trainable_backbone_layers, max_value, default_value):
# don't freeze any layers if neither the pretrained model nor the pretrained backbone is used
if not pretrained:
if trainable_backbone_layers is not None:
warnings.warn(
"Changing trainable_backbone_layers has not effect if "
"neither pretrained nor pretrained_backbone have been set to True, "
"falling back to trainable_backbone_layers={} so that all layers are trainable".format(max_value))
trainable_backbone_layers = max_value
# by default freeze first blocks
if trainable_backbone_layers is None:
trainable_backbone_layers = default_value
assert 0 <= trainable_backbone_layers <= max_value
return trainable_backbone_layers
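# Minimal sketch of how _validate_trainable_layers() resolves its argument; the
# values below are illustrative and not taken from the training scripts.
def _demo_validate_trainable_layers():
    # pretrained backbone and no explicit request: fall back to the default
    assert _validate_trainable_layers(True, None, max_value=5, default_value=3) == 3
    # no pretrained weights: all layers stay trainable (a warning is emitted
    # because an explicit value was passed)
    assert _validate_trainable_layers(False, 2, max_value=5, default_value=3) == 5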
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
from typing import Tuple
import torchvision
from torchvision.extension import _assert_has_ops
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
"""
Performs non-maximum suppression (NMS) on the boxes according
to their intersection-over-union (IoU).
NMS iteratively removes lower scoring boxes which have an
IoU greater than iou_threshold with another (higher scoring)
box.
If multiple boxes have the exact same score and satisfy the IoU
criterion with respect to a reference box, the selected box is
not guaranteed to be the same between CPU and GPU. This is similar
to the behavior of argsort in PyTorch when repeated values are present.
Args:
boxes (Tensor[N, 4]): boxes to perform NMS on. They
are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
``0 <= y1 < y2``.
scores (Tensor[N]): scores for each one of the boxes
iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
Returns:
Tensor: int64 tensor with the indices of the elements that have been kept
by NMS, sorted in decreasing order of scores
"""
_assert_has_ops()
return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
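# Illustrative use of nms(); the boxes and scores are made-up values.
def _demo_nms():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],     # overlaps heavily with the first box
                          [20., 20., 30., 30.]])  # disjoint box
    scores = torch.tensor([0.9, 0.8, 0.7])
    keep = nms(boxes, scores, iou_threshold=0.5)
    # the lower-scoring overlapping box is suppressed; expected keep == [0, 2]
    return keep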
def batched_nms(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
"""
Performs non-maximum suppression in a batched fashion.
Each index value corresponds to a category, and NMS
will not be applied between elements of different categories.
Args:
boxes (Tensor[N, 4]): boxes where NMS will be performed. They
are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
``0 <= y1 < y2``.
scores (Tensor[N]): scores for each one of the boxes
idxs (Tensor[N]): indices of the categories for each one of the boxes.
iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
Returns:
Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
in decreasing order of scores
"""
# Benchmarks that drove the following thresholds are at
# https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
# Ideally for GPU we'd use a higher threshold
if boxes.numel() > 4_000 and not torchvision._is_tracing():
return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
else:
return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
# strategy: in order to perform NMS independently per class,
# we add an offset to all the boxes. The offset is dependent
# only on the class idx, and is large enough so that boxes
# from different classes do not overlap
if boxes.numel() == 0:
return torch.empty((0,), dtype=torch.int64, device=boxes.device)
max_coordinate = boxes.max()
offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
boxes_for_nms = boxes + offsets[:, None]
keep = nms(boxes_for_nms, scores, iou_threshold)
return keep
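# Small numeric sketch of the coordinate trick above: boxes of different classes
# are shifted far apart, so a single nms() call never suppresses across classes.
def _demo_batched_nms_coordinate_trick():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.]])
    scores = torch.tensor([0.9, 0.8])
    idxs = torch.tensor([0, 1])  # nearly identical boxes, but different classes
    # thanks to the per-class offsets both boxes survive despite their high IoU
    keep = _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold=0.5)
    return keep  # expected: tensor([0, 1])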
@torch.jit._script_if_tracing
def _batched_nms_vanilla(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
# Based on Detectron2 implementation, just manually call nms() on each class independently
keep_mask = torch.zeros_like(scores, dtype=torch.bool)
for class_id in torch.unique(idxs):
curr_indices = torch.where(idxs == class_id)[0]
curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
keep_mask[curr_indices[curr_keep_indices]] = True
keep_indices = torch.where(keep_mask)[0]
return keep_indices[scores[keep_indices].sort(descending=True)[1]]
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
"""
Clip boxes so that they lie inside an image of size `size`.
Args:
boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
size (Tuple[height, width]): size of the image
Returns:
Tensor[N, 4]: clipped boxes
"""
dim = boxes.dim()
boxes_x = boxes[..., 0::2]
boxes_y = boxes[..., 1::2]
height, width = size
if torchvision._is_tracing():
boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
else:
boxes_x = boxes_x.clamp(min=0, max=width)
boxes_y = boxes_y.clamp(min=0, max=height)
clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
return clipped_boxes.reshape(boxes.shape)
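# Quick sketch of clip_boxes_to_image(); the values are illustrative.
def _demo_clip_boxes_to_image():
    boxes = torch.tensor([[-5., -5., 50., 120.]])
    clipped = clip_boxes_to_image(boxes, size=(100, 80))  # (height, width)
    # expected: [[0., 0., 50., 100.]] -- x clamped to [0, 80], y clamped to [0, 100]
    return clipped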
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their
(x1, y1, x2, y2) coordinates.
Args:
boxes (Tensor[N, 4]): boxes for which the area will be computed. They
are expected to be in (x1, y1, x2, y2) format with
``0 <= x1 < x2`` and ``0 <= y1 < y2``.
Returns:
Tensor[N]: the area for each box
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = _upcast(rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
return inter, union
def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
"""
Return intersection-over-union (Jaccard index) between two sets of boxes.
Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
``0 <= x1 < x2`` and ``0 <= y1 < y2``.
Args:
boxes1 (Tensor[N, 4]): first set of boxes
boxes2 (Tensor[M, 4]): second set of boxes
Returns:
Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
"""
inter, union = _box_inter_union(boxes1, boxes2)
iou = inter / union
return iou
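# Illustrative box_iou() call; the expected value follows directly from the formula above.
def _demo_box_iou():
    boxes1 = torch.tensor([[0., 0., 10., 10.]])
    boxes2 = torch.tensor([[0., 0., 10., 10.],
                           [5., 5., 15., 15.]])
    iou = box_iou(boxes1, boxes2)
    # expected: [[1.0, 25 / 175]] -- identical boxes give 1.0, the shifted box overlaps 5x5
    return iou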
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import torch.nn.functional as F
from torch import nn, Tensor
from typing import Tuple, List, Dict, Optional
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
class ExtraFPNBlock(nn.Module):
"""
Base class for the extra block in the FPN.
Args:
results (List[Tensor]): the result of the FPN
x (List[Tensor]): the original feature maps
names (List[str]): the names for each one of the
original feature maps
Returns:
results (List[Tensor]): the extended set of results
of the FPN
names (List[str]): the extended set of names for the results
"""
def forward(
self,
results: List[Tensor],
x: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
pass
class FeaturePyramidNetwork(nn.Module):
"""
Module that adds an FPN on top of a set of feature maps. This is based on
`"Feature Pyramid Networks for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
The feature maps are currently supposed to be in increasing depth
order.
The input to the model is expected to be an OrderedDict[Tensor], containing
the feature maps on top of which the FPN will be added.
Args:
in_channels_list (list[int]): number of channels for each feature map that
is passed to the module
out_channels (int): number of channels of the FPN representation
extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
be performed. It is expected to take the fpn features, the original
features and the names of the original features as input, and returns
a new list of feature maps and their corresponding names
Examples::
>>> m = torchvision.ops.FeaturePyramidNetwork([10, 20, 30], 5)
>>> # get some dummy data
>>> x = OrderedDict()
>>> x['feat0'] = torch.rand(1, 10, 64, 64)
>>> x['feat2'] = torch.rand(1, 20, 16, 16)
>>> x['feat3'] = torch.rand(1, 30, 8, 8)
>>> # compute the FPN on top of x
>>> output = m(x)
>>> print([(k, v.shape) for k, v in output.items()])
>>> # returns
>>> [('feat0', torch.Size([1, 5, 64, 64])),
>>> ('feat2', torch.Size([1, 5, 16, 16])),
>>> ('feat3', torch.Size([1, 5, 8, 8]))]
"""
def __init__(
self,
in_channels_list: List[int],
out_channels: int,
extra_blocks: Optional[ExtraFPNBlock] = None,
module_name: Optional[str] = "",
):
super(FeaturePyramidNetwork, self).__init__()
self.inner_blocks = nn.ModuleList()
self.layer_blocks = nn.ModuleList()
for in_channels in in_channels_list:
if in_channels == 0:
raise ValueError("in_channels=0 is currently not supported")
inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
self.inner_blocks.append(inner_block_module)
self.layer_blocks.append(layer_block_module)
# initialize parameters now to avoid modifying the initialization of top_blocks
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_uniform_(m.weight, a=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(m.bias, 0)
if extra_blocks is not None:
assert isinstance(extra_blocks, ExtraFPNBlock)
self.extra_blocks = extra_blocks
def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
but torchscript doesn't support this yet
"""
num_blocks = len(self.inner_blocks)
if idx < 0:
idx += num_blocks
i = 0
out = x
for module in self.inner_blocks:
if i == idx:
out = module(x)
i += 1
return out
def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.layer_blocks[idx](x),
but torchscript doesn't support this yet
"""
num_blocks = len(self.layer_blocks)
if idx < 0:
idx += num_blocks
i = 0
out = x
for module in self.layer_blocks:
if i == idx:
out = module(x)
i += 1
return out
def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Computes the FPN for a set of feature maps.
Args:
x (OrderedDict[Tensor]): feature maps for each feature level.
Returns:
results (OrderedDict[Tensor]): feature maps after FPN layers.
They are ordered from highest resolution first.
"""
# unpack OrderedDict into two lists for easier handling
names = list(x.keys())
x = list(x.values())
last_inner = self.get_result_from_inner_blocks(x[-1], -1)
results = []
results.append(self.get_result_from_layer_blocks(last_inner, -1))
for idx in range(len(x) - 2, -1, -1):
inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
feat_shape = inner_lateral.shape[-2:]
inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
last_inner = inner_lateral + inner_top_down
results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
if self.extra_blocks is not None:
results, names = self.extra_blocks(results, x, names)
# make it back an OrderedDict
out = OrderedDict([(k, v) for k, v in zip(names, results)])
return out
class LastLevelMaxPool(ExtraFPNBlock):
"""
Applies a max_pool2d on top of the last feature map
"""
def forward(
self,
x: List[Tensor],
y: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
names.append("pool")
x.append(F.max_pool2d(x[-1], 1, 2, 0))
return x, names
class LastLevelP6P7(ExtraFPNBlock):
"""
This module is used in RetinaNet to generate extra layers, P6 and P7.
"""
def __init__(self, in_channels: int, out_channels: int, module_name: Optional[str]=""):
super(LastLevelP6P7, self).__init__()
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
for name, module in self.named_modules(prefix=module_name):
if module in [self.p6, self.p7]:
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_uniform_(module.weight, a=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(module.bias, 0)
self.use_P5 = in_channels == out_channels
def forward(
self,
p: List[Tensor],
c: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
p5, c5 = p[-1], c[-1]
x = p5 if self.use_P5 else c5
p6 = self.p6(x)
p7 = self.p7(F.relu(p6))
p.extend([p6, p7])
names.extend(["p6", "p7"])
return p, names
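# Minimal sketch combining FeaturePyramidNetwork with LastLevelP6P7, mirroring the
# RetinaNet-style configuration; channel counts and spatial sizes are illustrative only.
def _demo_fpn_with_p6p7():
    import torch
    fpn = FeaturePyramidNetwork(
        in_channels_list=[512, 1024, 2048],
        out_channels=256,
        extra_blocks=LastLevelP6P7(256, 256),
    )
    x = OrderedDict()
    x["feat2"] = torch.rand(1, 512, 32, 32)
    x["feat3"] = torch.rand(1, 1024, 16, 16)
    x["feat4"] = torch.rand(1, 2048, 8, 8)
    out = fpn(x)
    # five levels come back: the three inputs plus "p6" and "p7"
    return [(k, v.shape) for k, v in out.items()]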
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
try:
from apex.contrib.focal_loss.focal_loss import FocalLoss
focal_loss_opt = FocalLoss.apply
except ImportError as err:
print("Could not import APEX fused focal loss, it's fine if you do not use --apex-focal-loss")
def sigmoid_focal_loss(
inputs: torch.Tensor,
targets: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
reduction: str = "none",
):
"""
Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py .
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs
(0 for the negative class and 1 for the positive class).
alpha: (optional) Weighting factor in range (0,1) to balance
positive vs negative examples or -1 for ignore. Default = 0.25
gamma: Exponent of the modulating factor (1 - p_t) to
balance easy vs hard examples.
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
Returns:
Loss tensor with the reduction option applied.
"""
p = torch.sigmoid(inputs)
ce_loss = F.binary_cross_entropy_with_logits(
inputs, targets, reduction="none"
)
p_t = p * targets + (1 - p) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
if reduction == "mean":
loss = loss.mean()
elif reduction == "sum":
loss = loss.sum()
return loss
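# Tiny numeric sketch of sigmoid_focal_loss(); the logits and labels are made-up values.
def _demo_sigmoid_focal_loss():
    inputs = torch.tensor([2.0, -1.0, 0.5])   # raw logits
    targets = torch.tensor([1.0, 0.0, 1.0])   # binary labels
    # well-classified examples (the first two) are down-weighted by (1 - p_t) ** gamma
    return sigmoid_focal_loss(inputs, targets, alpha=0.25, gamma=2, reduction="mean")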
# The following focal loss implementation is similar to the previous one, apart from an additional mask operation.
# The mask operation is handy when using CUDA graphs, since it enables fixed tensor dimensions (otherwise,
# a differently sized tensor would be used for each image).
def sigmoid_focal_loss_masked(
inputs: torch.Tensor,
targets: torch.Tensor,
mask: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
reduction: str = "none",
):
assert(reduction == "sum")
p = torch.sigmoid(inputs)
ce_loss = F.binary_cross_entropy_with_logits(
inputs, targets, reduction="none"
)
p_t = p * targets + (1 - p) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
loss = loss * mask
loss = loss.sum(dim=[1, 2])
return loss
def sigmoid_focal_loss_masked_fused(
inputs: torch.Tensor,
targets: torch.Tensor,
mask: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
label_smoothing: float = 0.0,
reduction: str = "none",
one_ptr: torch.Tensor = None
):
assert(reduction == "sum")
num_classes = inputs.size(2)
inputs_ = inputs.reshape([inputs.size(0), 1, 13343, 9, num_classes])
# -2 tells the kernel to ignore that value
targets_ = torch.where(mask, targets, -2)
targets_ = targets_.reshape([inputs.size(0), 1, 13343, 9])
# TODO: implement within the kernel and not with a loop
loss = []
inputs_list = torch.chunk(inputs_, inputs_.size(0))
targets_list = torch.chunk(targets_, targets_.size(0))
for b in range(inputs_.size(0)):
loss.append(focal_loss_opt(inputs_list[b], targets_list[b], one_ptr, num_classes, alpha, gamma, label_smoothing))
return torch.stack(loss)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import torch
from torch import Tensor
from typing import Callable, List, Optional
class FrozenBatchNorm2d(torch.nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters
are fixed
"""
def __init__(
self,
num_features: int,
eps: float = 1e-5,
n: Optional[int] = None,
):
# n=None for backward-compatibility
if n is not None:
warnings.warn("`n` argument is deprecated and has been renamed `num_features`",
DeprecationWarning)
num_features = n
super(FrozenBatchNorm2d, self).__init__()
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features))
def _load_from_state_dict(
self,
state_dict: dict,
prefix: str,
local_metadata: dict,
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
):
num_batches_tracked_key = prefix + 'num_batches_tracked'
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super(FrozenBatchNorm2d, self)._load_from_state_dict(
state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs)
# one-time preprocessing
self.weight = self.weight.reshape(1, -1, 1, 1)
self.bias = self.bias.reshape(1, -1, 1, 1)
self.running_var = self.running_var.reshape(1, -1, 1, 1)
self.running_mean = self.running_mean.reshape(1, -1, 1, 1)
# registering these variables as buffers
self.register_buffer("scale", self.weight * (self.running_var + self.eps).rsqrt())
self.register_buffer("bias_term", self.bias - self.running_mean * self.scale)
def forward(self, x: Tensor) -> Tensor:
return x * self.scale + self.bias_term
def __repr__(self) -> str:
return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
from typing import List, Tuple
class ImageList(object):
"""
Structure that holds a list of images (of possibly
varying sizes) as a single tensor.
This works by padding the images to the same size,
and storing in a field the original sizes of each image
"""
def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
"""
Args:
tensors (tensor)
image_sizes (list[tuple[int, int]])
"""
self.tensors = tensors
self.image_sizes = image_sizes
def to(self, device: torch.device) -> 'ImageList':
cast_tensor = self.tensors.to(device)
return ImageList(cast_tensor, self.image_sizes)
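# Minimal sketch of ImageList: two images of different sizes are padded into one
# batch tensor while their original sizes are kept alongside it.
def _demo_image_list():
    img1 = torch.rand(3, 480, 640)
    img2 = torch.rand(3, 600, 512)
    batch = torch.zeros(2, 3, 600, 640)
    batch[0, :, :480, :640].copy_(img1)
    batch[1, :, :600, :512].copy_(img2)
    image_list = ImageList(batch, [(480, 640), (600, 512)])
    return image_list.tensors.shape, image_list.image_sizes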
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import torch
import torch.nn.functional as F
from torch import Tensor, HalfTensor, BoolTensor
from typing import Callable, List, Optional, Tuple
from model.frozen_bn import FrozenBatchNorm2d
# For debugging backprop put the following in the function and uncomment
# import pydevd
# pydevd.settrace(suspend=False, trace_only_current_thread=True)
class bn_relu_wrapper(FrozenBatchNorm2d):
def __init__(self, num_features, eps=1e-5, n=None):
super(bn_relu_wrapper, self).__init__(num_features, eps, n)
def forward(self, x):
return bn_relu_jit.apply(x, self.scale, self.bias_term)
class bn_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input, scale, bias):
bn_relu_out, relu_mask = fwd_bn_relu_jit(input, scale, bias)
ctx.save_for_backward(scale, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale, relu_mask = ctx.saved_tensors
grad_input = bwd_bn_relu_jit(grad_output, scale, relu_mask)
return grad_input, None, None
@torch.jit.script
def fwd_bn_relu_jit(input: HalfTensor, scale: HalfTensor, bias: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn = input * scale + bias
bn_relu = torch.nn.functional.relu(bn)
relu_mask = bn > 0
return bn_relu, relu_mask
@torch.jit.script
def bwd_bn_relu_jit(grad_output: HalfTensor, scale: HalfTensor, relu_mask: BoolTensor) -> HalfTensor:
grad_input = grad_output * scale
grad_input = grad_input * relu_mask
return grad_input
class bn_add_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input1, scale1, bias1, input2):
bn_relu_out, relu_mask = fwd_bn_add_relu_jit(input1, scale1, bias1, input2)
ctx.save_for_backward(scale1, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale, relu_mask = ctx.saved_tensors
grad_input1, grad_input2 = bwd_bn_add_relu_jit(grad_output, scale, relu_mask)
return grad_input1, None, None, grad_input2
@torch.jit.script
def fwd_bn_add_relu_jit(input1: HalfTensor, scale1: HalfTensor, bias1: HalfTensor,
input2: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn = input1 * scale1 + bias1
bn_add = bn + input2
bn_add_relu = torch.nn.functional.relu(bn_add)
relu_mask = bn_add > 0
return bn_add_relu, relu_mask
@torch.jit.script
def bwd_bn_add_relu_jit(grad_output: HalfTensor, scale: HalfTensor,
relu_mask: BoolTensor) -> Tuple[HalfTensor, HalfTensor]:
grad_input2 = grad_output * relu_mask
grad_input1 = grad_input2 * scale
return grad_input1, grad_input2
class bn_bn_add_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input1, scale1, bias1, input2, scale2, bias2):
bn_relu_out, relu_mask = fwd_bn_bn_add_relu_jit(input1, scale1, bias1,
input2, scale2, bias2)
ctx.save_for_backward(scale1, scale2, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale1, scale2, relu_mask = ctx.saved_tensors
grad_input1, grad_input2 = bwd_bn_bn_add_relu_jit(grad_output, scale1, scale2, relu_mask)
return grad_input1, None, None, grad_input2, None, None
@torch.jit.script
def fwd_bn_bn_add_relu_jit(input1: HalfTensor, scale1: HalfTensor, bias1: HalfTensor,
input2: HalfTensor, scale2: HalfTensor, bias2: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn1 = input1 * scale1 + bias1
bn2 = input2 * scale2 + bias2
bn_add = bn1 + bn2
bn_add_relu = torch.nn.functional.relu(bn_add)
relu_mask = bn_add > 0
return bn_add_relu, relu_mask
@torch.jit.script
def bwd_bn_bn_add_relu_jit(grad_output: HalfTensor, scale1: HalfTensor, scale2: HalfTensor,
relu_mask: BoolTensor) -> Tuple[HalfTensor, HalfTensor]:
grad_output_masked = grad_output * relu_mask
grad_input1 = grad_output_masked * scale1
grad_input2 = grad_output_masked * scale2
return grad_input1, grad_input2
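# Quick consistency sketch: the fused bn_relu_jit above should match a plain
# scale/shift followed by ReLU. Half-precision tensors are used, as the fused
# helpers assume.
def _demo_bn_relu_jit():
    x = torch.rand(1, 4, 8, 8).half()
    scale = torch.rand(1, 4, 1, 1).half()
    bias = torch.rand(1, 4, 1, 1).half()
    fused = bn_relu_jit.apply(x, scale, bias)
    reference = F.relu(x * scale + bias)
    return torch.equal(fused, reference)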
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import time
import numpy as np
import pickle
IS_PROFILE = False
IS_WALL_TIME = False
class stats_wrapper:
def __init__(self):
self.records = {}
self.warmup_t = 0
def print_all(self):
print('>>> START STATS PRINT <<<')
for k, v in self.records.items():
samples = np.asarray(v['samples'])
mean = np.mean(samples)
standard_deviation = np.std(samples)
distance_from_mean = abs(samples - mean)
max_deviations = 2
not_outlier = distance_from_mean < max_deviations * standard_deviation
samples_ = samples[not_outlier]
avg = samples_.mean()
var = samples_.var()
print('{}, {}, {}, {}, {}'.format(k, avg * 1000, var * 1000, samples.max() * 1000, samples.min() * 1000))
print('>>> END STATS PRINT <<<')
pickle.dump(self.records, open(b"records.pkl", "wb"))
def create(self, k):
if k not in self.records:
self.records[k] = {'n': self.warmup_t * (-1), 'samples': []}
def add(self, k, v):
self.records[k]['samples'].append(v)
self.records[k]['n'] += 1
class measure_t:
def __init__(self, name, enable=True):
self.name = name
self.t0, self.t1 = 0, 0
self.enable = enable
self.is_running = False
if enable:
stats.create(self.name)
def __enter__(self):
if not self.enable:
return
self.start()
def __exit__(self, type, value, traceback):
if not self.enable:
return
self.stop()
def start(self):
if not self.enable:
return
if IS_PROFILE:
torch.cuda.nvtx.range_push(self.name)
if IS_WALL_TIME:
torch.cuda.synchronize()
self.t0 = time.time()
self.is_running = True
def stop(self):
if not self.enable:
return
if self.is_running:
if IS_PROFILE:
torch.cuda.nvtx.range_pop()
if IS_WALL_TIME:
torch.cuda.synchronize()
self.t1 = time.time()
delta = self.t1 - self.t0
stats.add(self.name, delta)
self.is_running = False
stats = stats_wrapper()
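# Illustrative use of the timing helpers above. With IS_PROFILE/IS_WALL_TIME left
# False the context manager records nothing useful; set IS_WALL_TIME = True (CUDA is
# assumed, since start()/stop() call torch.cuda.synchronize) to collect wall-clock
# samples, then stats.print_all() reports per-region statistics and dumps records.pkl.
def _demo_measure_t():
    with measure_t("dummy_region"):
        torch.ones(1024, 1024).sum()
    stats.print_all()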
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
import torch.nn as nn
from torch.hub import load_state_dict_from_url
from typing import Type, Any, Callable, Union, List, Optional
from .jit_fn import bn_relu_jit, bn_add_relu_jit, bn_bn_add_relu_jit, bn_relu_wrapper
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
__all__ = ['resnet50', 'resnet101',
'resnext50_32x4d', 'resnext101_32x8d']
model_urls = {
'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth',
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth',
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
}
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
expansion: int = 1
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class BottleneckJIT(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(BottleneckJIT, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = bn_relu_jit.apply(out, self.bn1.scale, self.bn1.bias_term)
out = self.conv2(out)
out = bn_relu_jit.apply(out, self.bn2.scale, self.bn2.bias_term)
out = self.conv3(out)
if self.downsample is not None:
identity = self.downsample[0](x)
out = bn_bn_add_relu_jit.apply(out, self.bn3.scale, self.bn3.bias_term,
identity, self.downsample[1].scale, self.downsample[1].bias_term)
else:
out = bn_add_relu_jit.apply(out, self.bn3.scale, self.bn3.bias_term, identity)
return out
class ResNet(nn.Module):
def __init__(
self,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
num_classes: int = 1000,
zero_init_residual: bool = False,
groups: int = 1,
width_per_group: int = 64,
replace_stride_with_dilation: Optional[List[bool]] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
module_name: Optional[str] = "",
**kwargs: Any
) -> None:
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.jit = kwargs['jit']
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
# have to work around this with bn_relu_wrapper, since during execution the forward function is not called
if self.jit:
self.bn1 = bn_relu_wrapper(self.inplanes)
else:
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.weight, 1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, Bottleneck):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type]
elif isinstance(m, BasicBlock):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type]
def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
stride: int = 1, dilate: bool = False) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = [block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer)]
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def _forward_impl(self, x: Tensor) -> Tensor:
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x) if not self.jit else x
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
def forward(self, x: Tensor) -> Tensor:
return self._forward_impl(x)
def _resnet(
arch: str,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
pretrained: bool,
progress: bool,
**kwargs: Any
) -> ResNet:
model = ResNet(block, layers, module_name="module.backbone.body", **kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls[arch],
progress=progress)
model.load_state_dict(state_dict)
return model
def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
**kwargs)
def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNet-101 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
**kwargs)
def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
resnet_block = BottleneckJIT if kwargs['jit'] else Bottleneck
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
return _resnet('resnext50_32x4d', resnet_block, [3, 4, 6, 3],
pretrained, progress, **kwargs)
def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNeXt-101 32x8d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
pretrained, progress, **kwargs)
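# Minimal sketch of building one of the backbones above. Note that ResNet.__init__
# reads kwargs['jit'] directly, so a `jit` keyword must always be supplied.
def _demo_resnet50():
    model = resnet50(pretrained=False, jit=False)
    x = torch.rand(1, 3, 224, 224)
    return model(x).shape  # expected: torch.Size([1, 1000])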
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from collections import OrderedDict
import warnings
import torch
from torch import nn, Tensor
from torch.hub import load_state_dict_from_url
from typing import Dict, List, Tuple, Optional
from model.anchor_utils import AnchorGenerator
from model.transform import GeneralizedRCNNTransform
from model.backbone_utils import resnet_fpn_backbone, _validate_trainable_layers
from model.feature_pyramid_network import LastLevelP6P7
from model.focal_loss import sigmoid_focal_loss, sigmoid_focal_loss_masked, sigmoid_focal_loss_masked_fused
from model.boxes import box_iou, clip_boxes_to_image, batched_nms
from model.utils import Matcher, MatcherBatch, overwrite_eps, BoxCoder
from .frozen_bn import FrozenBatchNorm2d
from torchvision.ops import misc as misc_nn_ops
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
import utils
try:
from apex.contrib.conv_bias_relu import ConvBiasReLU, ConvBias
except ImportError as err:
print("Could not import APEX fused Conv-Bias-ReLU, it's fine if you do not use --apex-head")
__all__ = [
"retinanet_from_backbone",
"retinanet_resnet50_fpn",
"retinanet_resnet101_fpn",
"retinanet_resnext50_32x4d_fpn",
"retinanet_resnext101_32x8d_fpn",
]
class GradClone_(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd
def forward(ctx, x):
return x
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
return grad_output.clone()
GradClone = GradClone_.apply
def _sum(x: List[Tensor]) -> Tensor:
res = x[0]
for i in x[1:]:
res = res + i
return res
def cudnn_fusion_warmup(bs_list):
hw_dim_list = [100, 50, 25, 13, 7]
for bs in bs_list:
for hw in hw_dim_list:
ConvBiasReLU(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([256, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 256, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
ConvBias(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([2376, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 2376, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
ConvBias(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([36, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 36, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
class RetinaNetHead(nn.Module):
"""
A regression and classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
"""
def __init__(self, in_channels, num_anchors, num_classes, fusion=False):
super().__init__()
self.classification_head = RetinaNetClassificationHead(in_channels, num_anchors, num_classes, fusion=fusion,
module_name="module.head.classification_head")
self.regression_head = RetinaNetRegressionHead(in_channels, num_anchors, fusion=fusion,
module_name="module.head.regression_head")
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Dict[str, Tensor]
return {
'classification': self.classification_head.compute_loss(targets, head_outputs, matched_idxs),
'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
}
def forward(self, x):
return [self.classification_head(x), self.regression_head(x)]
class RetinaNetClassificationHead(nn.Module):
"""
A classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
"""
def __init__(self, in_channels, num_anchors, num_classes, prior_probability=0.01, fusion=False, module_name=""):
super().__init__()
conv = []
for _ in range(4):
conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
conv.append(nn.ReLU())
self.conv = nn.Sequential(*conv)
for name, layer in self.conv.named_children():
if isinstance(layer, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.weight"})
torch.nn.init.normal_(layer.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.bias"})
torch.nn.init.constant_(layer.bias, 0)
self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.cls_logits.weight"})
torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.cls_logits.bias"})
torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))
self.num_classes = num_classes
self.num_anchors = num_anchors
# This is to fix using det_utils.Matcher.BETWEEN_THRESHOLDS in TorchScript.
# TorchScript doesn't support class attributes.
# https://github.com/pytorch/vision/pull/1697#issuecomment-630255584
self.BETWEEN_THRESHOLDS = Matcher.BETWEEN_THRESHOLDS
self.register_buffer("one", torch.Tensor([1.]))
self.fusion = fusion
# --- original implementation ---
def compute_loss(self, targets, head_outputs, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor]) -> Tensor
losses = []
cls_logits = head_outputs['cls_logits']
for labels_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets['labels'], cls_logits, matched_idxs):
# determine only the foreground
foreground_idxs_per_image = matched_idxs_per_image >= 0
num_foreground = foreground_idxs_per_image.sum()
# create the target classification
gt_classes_target = torch.zeros_like(cls_logits_per_image)
gt_classes_target[
foreground_idxs_per_image,
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
] = 1.0
# find indices for which anchors should be ignored
valid_idxs_per_image = matched_idxs_per_image != self.BETWEEN_THRESHOLDS
# compute the classification loss
losses.append(sigmoid_focal_loss(
cls_logits_per_image[valid_idxs_per_image],
gt_classes_target[valid_idxs_per_image],
reduction='sum',
) / max(1, num_foreground))
# it doesn't matter which targets['?'] is taken; its length gives the batch size
return _sum(losses) / len(targets['boxes'])
def compute_loss_prologue(self, target_labels, matched_idxs, one_hot):
# determine only the foreground
foreground_idxs_ = matched_idxs >= 0
num_foreground_ = foreground_idxs_.sum(dim=1)
# find indices for which anchors should be ignored
valid_idxs_ = matched_idxs != self.BETWEEN_THRESHOLDS
# TODO: unable to parallelize, try again
for i, (labels_per_image, matched_idxs_per_image, foreground_idxs_per_image) in \
enumerate(zip(target_labels, matched_idxs, foreground_idxs_)):
# create the target classification
if one_hot:
utils.ScratchPad.gt_classes_target[i][
foreground_idxs_per_image,
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
] = 1.0
else:
utils.ScratchPad.gt_classes_target[i][foreground_idxs_per_image] = \
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
return utils.ScratchPad.gt_classes_target, num_foreground_, valid_idxs_
def compute_loss_prologue_padded(self, target_labels, matched_idxs, one_hot, max_boxes):
# buffers are initialized in init_scratchpad
# utils.ScratchPad.gt_classes_target.fill_(0 if one_hot else -1)
# determine only the foreground
foreground_idxs_ = matched_idxs >= 0
num_foreground_ = foreground_idxs_.sum(dim=1)
# find indices for which anchors should be ignored
valid_idxs_ = matched_idxs != self.BETWEEN_THRESHOLDS
if one_hot:
idxs = torch.gather(target_labels, 1, torch.where(foreground_idxs_, matched_idxs, max_boxes))
utils.ScratchPad.gt_classes_target.scatter_(2, idxs[:, :, None], 1)
gt_classes_target = utils.ScratchPad.gt_classes_target[:, :, :-1]
else:
utils.ScratchPad.gt_classes_target = \
torch.gather(target_labels, 1, torch.where(foreground_idxs_, matched_idxs, max_boxes))
gt_classes_target = utils.ScratchPad.gt_classes_target
return gt_classes_target, num_foreground_, valid_idxs_
def compute_loss_core(self, cls_logits, gt_classes_target, valid_idxs, num_foreground, fused_focal_loss=False):
# notice that in the original implementation, the focal loss input dimension may differ
if not fused_focal_loss:
losses = sigmoid_focal_loss_masked(cls_logits, gt_classes_target, valid_idxs[:, :, None], reduction='sum')
else:
losses = sigmoid_focal_loss_masked_fused(cls_logits, gt_classes_target, valid_idxs, reduction='sum',
one_ptr=self.one)
losses = losses / num_foreground
return _sum(losses) / num_foreground.size(0)
def forward(self, x):
# type: (List[Tensor]) -> Tensor
all_cls_logits = []
# since weights are shared, we can cast weights and biases only one time per iteration
if self.fusion:
conv1_w = self.conv[0].weight.half()
conv2_w = self.conv[2].weight.half()
conv3_w = self.conv[4].weight.half()
conv4_w = self.conv[6].weight.half()
conv5_w = self.cls_logits.weight.half()
conv1_b = self.conv[0].bias.reshape(1, -1, 1, 1).half()
conv2_b = self.conv[2].bias.reshape(1, -1, 1, 1).half()
conv3_b = self.conv[4].bias.reshape(1, -1, 1, 1).half()
conv4_b = self.conv[6].bias.reshape(1, -1, 1, 1).half()
conv5_b = self.cls_logits.bias.reshape(1, -1, 1, 1).half()
for features in x:
if not self.fusion:
cls_logits = self.conv(features)
cls_logits = self.cls_logits(cls_logits)
else:
cls_logits = ConvBiasReLU(features, conv1_w, conv1_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv2_w, conv2_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv3_w, conv3_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv4_w, conv4_b, 1, 1)
cls_logits = ConvBias(cls_logits, conv5_w, conv5_b, 1, 1)
# cloning grad in backprop to make it contiguous for fusion code
cls_logits = GradClone(cls_logits)
# Permute classification output from (N, A * K, H, W) to (N, HWA, K).
N, _, H, W = cls_logits.shape
cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
cls_logits = cls_logits.permute(0, 3, 4, 1, 2)
            cls_logits = cls_logits.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
all_cls_logits.append(cls_logits)
return torch.cat(all_cls_logits, dim=1)
class RetinaNetRegressionHead(nn.Module):
"""
A regression head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
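    Example (illustrative sketch; the channel, anchor, and feature-map sizes below are assumed)::
        >>> head = RetinaNetRegressionHead(in_channels=256, num_anchors=9)
        >>> # five feature maps with assumed spatial sizes, e.g. as produced by an FPN
        >>> feats = [torch.rand(2, 256, s, s) for s in (100, 50, 25, 13, 7)]
        >>> out = head(feats)  # -> shape (2, 9 * (100*100 + 50*50 + 25*25 + 13*13 + 7*7), 4)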
"""
__annotations__ = {
'box_coder': BoxCoder,
}
def __init__(self, in_channels, num_anchors, fusion=False, module_name=""):
super().__init__()
conv = []
for _ in range(4):
conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
conv.append(nn.ReLU())
self.conv = nn.Sequential(*conv)
self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.bbox_reg.weight"})
torch.nn.init.normal_(self.bbox_reg.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.bbox_reg.bias"})
torch.nn.init.zeros_(self.bbox_reg.bias)
for name, layer in self.conv.named_children():
if isinstance(layer, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.weight"})
torch.nn.init.normal_(layer.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.bias"})
torch.nn.init.zeros_(layer.bias)
self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
self.fusion = fusion
# --- original implementation ---
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor
losses = []
bbox_regression = head_outputs['bbox_regression']
for boxes_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \
zip(targets['boxes'], bbox_regression, anchors, matched_idxs):
# determine only the foreground indices, ignore the rest
foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
num_foreground = foreground_idxs_per_image.numel()
# select only the foreground boxes
matched_gt_boxes_per_image = boxes_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
# compute the regression targets
target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
# compute the loss
losses.append(torch.nn.functional.l1_loss(
bbox_regression_per_image,
target_regression,
reduction='sum'
) / max(1, num_foreground))
        # it doesn't matter which targets['?'] entry is taken; its length is the batch size
return _sum(losses) / len(targets['boxes'])
def compute_loss_prologue(self, target_boxes, matched_idxs, anchors):
foreground_idxs_mask, num_foreground_, target_regression_ = [], [], []
for boxes_per_image, anchors_per_image, matched_idxs_per_image in zip(target_boxes, anchors, matched_idxs):
foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
num_foreground = foreground_idxs_per_image.numel()
foreground_idxs_mask.append(foreground_idxs_per_image)
num_foreground_.append(num_foreground)
# select only the foreground boxes
matched_gt_boxes_per_image = boxes_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
# compute the regression targets
target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
target_regression_.append(target_regression)
return target_regression_, num_foreground_, foreground_idxs_mask
def compute_loss_core(self, bbox_regression, target_regression, foreground_idxs, num_foreground):
losses = []
for bbox_regression_i, target_regression_i, foreground_idxs_i, num_foreground_i in \
zip(bbox_regression, target_regression, foreground_idxs, num_foreground):
bbox_regression_i_ = bbox_regression_i[foreground_idxs_i, :]
losses.append(torch.nn.functional.l1_loss(bbox_regression_i_, target_regression_i, reduction='sum')
/ max(1, num_foreground_i))
return _sum(losses) / num_foreground.size(0)
def compute_loss_prologue_padded(self, target_boxes, matched_idxs, anchors):
# notice the number of boxes is padded in this implementation
# make sure we do not trim bboxes
# assert (matched_idxs.max() < max_boxes)
foreground_idxs_mask = matched_idxs >= 0
num_foreground_ = foreground_idxs_mask.sum(dim=1)
# clamping to avoid -2, -1
matched_idxs_clamped = torch.clamp(matched_idxs, min=0)
        # check that the precomputed batch-index vector matches the current batch size;
        # the advanced indexing below relies on it
assert(utils.ScratchPad.batch_size_vector.size(0) == len(target_boxes))
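        # batch_size_vector is assumed to be a per-image row index (an arange over the batch,
        # broadcast over anchors), so the advanced indexing below picks, for each anchor, the
        # matched (clamped) gt box from its own image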
matched_gt_boxes_ = target_boxes[utils.ScratchPad.batch_size_vector, matched_idxs_clamped]
target_regression_ = self.box_coder.encode_batch(matched_gt_boxes_,
torch.stack(anchors)) * foreground_idxs_mask[:, :, None]
return target_regression_, num_foreground_, foreground_idxs_mask
def compute_loss_core_padded(self, bbox_regression, target_regression, foreground_idxs, num_foreground):
bbox_regression_masked = bbox_regression * foreground_idxs[:, :, None]
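        # target_regression (from the padded prologue) and bbox_regression_masked are both zero outside
        # the foreground anchors, so the L1 norm over dims (1, 2) equals a per-image
        # l1_loss(reduction='sum') over the foreground anchors only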
losses = torch.norm(bbox_regression_masked - target_regression, 1, dim=[1, 2]) / \
torch.max(torch.ones_like(num_foreground), num_foreground)
# The denominator is just the batch size
return _sum(losses) / num_foreground.size(0)
def forward(self, x):
# type: (List[Tensor]) -> Tensor
all_bbox_regression = []
        # since weights are shared, we can cast weights and biases just once per iteration
if self.fusion:
conv1_w = self.conv[0].weight.half()
conv2_w = self.conv[2].weight.half()
conv3_w = self.conv[4].weight.half()
conv4_w = self.conv[6].weight.half()
conv5_w = self.bbox_reg.weight.half()
conv1_b = self.conv[0].bias.reshape(1, -1, 1, 1).half()
conv2_b = self.conv[2].bias.reshape(1, -1, 1, 1).half()
conv3_b = self.conv[4].bias.reshape(1, -1, 1, 1).half()
conv4_b = self.conv[6].bias.reshape(1, -1, 1, 1).half()
conv5_b = self.bbox_reg.bias.reshape(1, -1, 1, 1).half()
for features in x:
if not self.fusion:
bbox_regression = self.conv(features)
bbox_regression = self.bbox_reg(bbox_regression)
else:
bbox_regression = ConvBiasReLU(features, conv1_w, conv1_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv2_w, conv2_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv3_w, conv3_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv4_w, conv4_b, 1, 1)
bbox_regression = ConvBias(bbox_regression, conv5_w, conv5_b, 1, 1)
# cloning grad in backprop to make it contiguous for fusion code
bbox_regression = GradClone(bbox_regression)
# Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4).
N, _, H, W = bbox_regression.shape
bbox_regression = bbox_regression.view(N, -1, 4, H, W)
bbox_regression = bbox_regression.permute(0, 3, 4, 1, 2)
bbox_regression = bbox_regression.reshape(N, -1, 4) # Size=(N, HWA, 4)
all_bbox_regression.append(bbox_regression)
return torch.cat(all_bbox_regression, dim=1)
class RetinaNet(nn.Module):
"""
Implements RetinaNet.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the class label for each ground-truth box
The model returns a Dict[Tensor] during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores for each prediction
Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
channels that each feature map has (and it should be the same for all feature maps).
The backbone should return a single Tensor or an OrderedDict[Tensor].
num_classes (int): number of output classes of the model (including the background).
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained.
anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
maps.
head (nn.Module): Module run on top of the feature pyramid.
Defaults to a module containing a classification and regression module.
score_thresh (float): Score threshold used for postprocessing the detections.
nms_thresh (float): NMS threshold used for postprocessing the detections.
detections_per_img (int): Number of best detections to keep after NMS.
fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
considered as positive during training.
bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
considered as negative during training.
topk_candidates (int): Number of best detections to keep before NMS.
Example:
>>> import torch
>>> import torchvision
>>> from torchvision.models.detection import RetinaNet
>>> from torchvision.models.detection.anchor_utils import AnchorGenerator
>>> # load a pre-trained model for classification and return
>>> # only the features
>>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
>>> # RetinaNet needs to know the number of
>>> # output channels in a backbone. For mobilenet_v2, it's 1280
>>> # so we need to add it here
>>> backbone.out_channels = 1280
>>>
>>> # let's make the network generate 5 x 3 anchors per spatial
>>> # location, with 5 different sizes and 3 different aspect
>>> # ratios. We have a Tuple[Tuple[int]] because each feature
>>> # map could potentially have different sizes and
>>> # aspect ratios
>>> anchor_generator = AnchorGenerator(
>>> sizes=((32, 64, 128, 256, 512),),
>>> aspect_ratios=((0.5, 1.0, 2.0),)
>>> )
>>>
>>> # put the pieces together inside a RetinaNet model
>>> model = RetinaNet(backbone,
>>> num_classes=2,
>>> anchor_generator=anchor_generator)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
"""
__annotations__ = {
'box_coder': BoxCoder,
'proposal_matcher': Matcher,
}
def __init__(self, backbone, num_classes, data_layout='channels_first', head_fusion=False,
# transform parameters
image_size=None, image_mean=None, image_std=None,
# Anchor parameters
anchor_generator=None, head=None,
# Detection parameters
proposal_matcher=None,
score_thresh=0.05,
nms_thresh=0.5,
detections_per_img=300,
fg_iou_thresh=0.5, bg_iou_thresh=0.4,
topk_candidates=1000):
super().__init__()
if not hasattr(backbone, "out_channels"):
raise ValueError(
"backbone should contain an attribute out_channels "
"specifying the number of output channels (assumed to be the "
"same for all the levels)")
self.backbone = backbone
self.data_layout = data_layout
assert isinstance(anchor_generator, (AnchorGenerator, type(None)))
if anchor_generator is None:
anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(
anchor_sizes, aspect_ratios
)
self.anchor_generator = anchor_generator
self.anchors = None
if head is None:
head = RetinaNetHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes,
fusion=head_fusion)
self.head = head
if proposal_matcher is None:
proposal_matcher = Matcher(
fg_iou_thresh,
bg_iou_thresh,
allow_low_quality_matches=True,
)
else:
warnings.warn('proposal_matcher_batch is statically assigned to MatcherBatch')
self.proposal_matcher = proposal_matcher
self.proposal_matcher_batch = MatcherBatch(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=True)
self.score_thresh = score_thresh
self.nms_thresh = nms_thresh
self.detections_per_img = detections_per_img
self.topk_candidates = topk_candidates
self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
self.anchors = None
if image_size is None:
image_size = [800, 800]
if image_std is None:
image_std = [0.229, 0.224, 0.225]
if image_mean is None:
image_mean = [0.485, 0.456, 0.406]
self.transform = GeneralizedRCNNTransform(image_size=image_size,
image_mean=image_mean, image_std=image_std)
        # used only in torchscript mode
self._has_warned = False
@torch.jit.unused
def eager_outputs(self, losses, detections):
# type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
if self.training:
return losses
return detections
# --- original code ---
def get_matched_idxs(self, target_boxes):
matched_idxs = []
for anchors_per_image, boxes_per_image in zip(self.anchors, target_boxes):
if boxes_per_image.numel() == 0:
matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64,
device=anchors_per_image.device))
continue
match_quality_matrix = box_iou(boxes_per_image, anchors_per_image)
matched_idxs.append(self.proposal_matcher(match_quality_matrix))
return torch.stack(matched_idxs)
    # --- parallel implementation ---
    # this implementation is not in use: (1) matching is already done as part of the DALI pipeline; and
    # (2) because of the significant padding of target_boxes, box_iou incurs a significant computational overhead
def get_matched_idxs_padded(self, target_boxes, batch_sz, max_boxes):
target_boxes_ = target_boxes.reshape(-1, 4)
match_quality_matrix = box_iou(target_boxes_, self.anchors[0])
match_quality_matrix = match_quality_matrix.reshape([batch_sz, max_boxes, -1])
matched_idxs = self.proposal_matcher_batch(match_quality_matrix)
return matched_idxs
# --- original code ---
def compute_loss(self, targets, head_outputs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor]) -> Dict[str, Tensor]
matched_idxs = []
for anchors_per_image, boxes_per_image in zip(self.anchors, targets['boxes']):
            # Uncomment to trim targets to MAX_BOXES, so this can be used as a reference
# boxes_per_image = boxes_per_image[0:MAX_BOXES, :]
if boxes_per_image.numel() == 0:
matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64,
device=anchors_per_image.device))
continue
match_quality_matrix = box_iou(boxes_per_image, anchors_per_image)
matched_idxs.append(self.proposal_matcher(match_quality_matrix))
return self.head.compute_loss(targets, head_outputs, self.anchors, matched_idxs)
def update_anchors(self, images, device, features=None, dtype=torch.float16, force=False):
# TODO: should perhaps create once in the relevant constructor
if self.anchors is None or force is True:
if features is None:
# forward_opt uses the default grid size (100, 50, 25, 13, 7)
# images is the image tensor shape
self.anchors = self.anchor_generator.forward_opt(image_shape=images, device=device, dtype=dtype)
else:
# using the old method if the features are passed
self.anchors = self.anchor_generator.forward(images, features)
def eval_postprocess_detections(self, head_outputs, anchors, image_shapes):
# type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
class_logits = head_outputs['cls_logits']
box_regression = head_outputs['bbox_regression']
num_images = len(image_shapes)
detections: List[Dict[str, Tensor]] = []
for index in range(num_images):
box_regression_per_image = [br[index] for br in box_regression]
logits_per_image = [cl[index] for cl in class_logits]
            # anchors are identical for all images, i.e. anchors[i] == anchors[j] for every i != j
anchors_per_image, image_shape = anchors[0], image_shapes[index]
image_boxes = []
image_scores = []
image_labels = []
for box_regression_per_level, logits_per_level, anchors_per_level in \
zip(box_regression_per_image, logits_per_image, anchors_per_image):
num_classes = logits_per_level.shape[-1]
# remove low scoring boxes
scores_per_level = torch.sigmoid(logits_per_level).flatten()
keep_idxs = scores_per_level > self.score_thresh
scores_per_level = scores_per_level[keep_idxs]
topk_idxs = torch.where(keep_idxs)[0]
# keep only topk scoring predictions
num_topk = min(self.topk_candidates, topk_idxs.size(0))
scores_per_level, idxs = scores_per_level.topk(num_topk)
topk_idxs = topk_idxs[idxs]
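                # topk_idxs index the flattened (anchor, class) grid, hence anchor = idx // K and class = idx % K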
anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')
labels_per_level = topk_idxs % num_classes
boxes_per_level = self.box_coder.decode_single(box_regression_per_level[anchor_idxs],
anchors_per_level[anchor_idxs])
boxes_per_level = clip_boxes_to_image(boxes_per_level, image_shape)
image_boxes.append(boxes_per_level)
image_scores.append(scores_per_level)
image_labels.append(labels_per_level)
image_boxes = torch.cat(image_boxes, dim=0)
image_scores = torch.cat(image_scores, dim=0)
image_labels = torch.cat(image_labels, dim=0)
# non-maximum suppression
keep = batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
keep = keep[:self.detections_per_img]
detections.append({
'boxes': image_boxes[keep],
'scores': image_scores[keep],
'labels': image_labels[keep],
})
return detections
def eval_postprocess(self, images, features, targets, head_outputs, targets_dict=False):
# recover level sizes
num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
HW = 0
for v in num_anchors_per_level:
HW += v
HWA = head_outputs['cls_logits'].size(1)
A = HWA // HW
num_anchors_per_level = [hw * A for hw in num_anchors_per_level]
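        # e.g. with the default 800x800 setup (levels of 100x100, 50x50, 25x25, 13x13 and 7x7)
        # and A = 9 anchors per location: HW = 13343 and HWA = 120087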
# split outputs per level
split_head_outputs: Dict[str, List[Tensor]] = {}
for k in head_outputs:
split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
split_anchors = [list(a.split(num_anchors_per_level)) for a in self.anchors]
# get the original image sizes
original_image_sizes = []
if targets_dict:
original_image_sizes = targets['original_image_size']
else:
for target in targets:
original_image_sizes.append(target['original_image_size'])
# compute the detections
detections = self.eval_postprocess_detections(split_head_outputs, split_anchors,
[(image.size(1), image.size(2)) for image in images])
detections = self.transform.postprocess(detections,
[(image.size(1), image.size(2)) for image in images],
original_image_sizes)
return detections
def validate_input(self, images, targets):
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.training:
assert targets is not None
for boxes in targets["boxes"]:
if isinstance(boxes, torch.Tensor):
if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
raise ValueError("Expected target boxes to be a tensor"
"of shape [N, 4], got {:}.".format(
boxes.shape))
else:
raise ValueError("Expected target boxes to be of type "
"Tensor, got {:}.".format(type(boxes)))
# check for degenerate boxes
if targets is not None:
for target_idx, boxes in enumerate(targets["boxes"]):
degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
if degenerate_boxes.any():
# print the first degenerate box
bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
degen_bb: List[float] = boxes[bb_idx].tolist()
raise ValueError("All bounding boxes should have positive height and width."
" Found invalid box {} for target at index {}."
.format(degen_bb, target_idx))
def forward(self, images: Tensor) -> Tuple[Tensor]:
"""
Args:
images (Tensor): images to be processed
Returns:
            result (Tuple[Tensor]): the output from the model: [0]-[4] the feature pyramid levels
                (100x100, 50x50, 25x25, 13x13, 7x7), [5] the classification head output,
                [6] the regression head output
"""
# get the features from the backbone
features = self.backbone(images)
if isinstance(features, torch.Tensor):
features = OrderedDict([('0', features)])
features = list(features.values())
# compute the retinanet heads outputs using the features
head_outputs = self.head(features)
features.extend(head_outputs)
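        # features now holds the five pyramid maps followed by the classification and regression head outputs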
out = tuple(features)
return out
model_urls = {
'retinanet_resnet50_fpn_coco':
'https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth',
}
def retinanet_resnet50_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a ResNet-50-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnet50', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
state_dict = load_state_dict_from_url(model_urls['retinanet_resnet50_fpn_coco'],
progress=progress)
model.load_state_dict(state_dict)
overwrite_eps(model, 0.0)
return model
def retinanet_resnext50_32x4d_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None, jit=False, head_fusion=False, frozen_bn_opt=False):
"""
Constructs a RetinaNet model with a resnext50_32x4d-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnext50_32x4d_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnext50_32x4d', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers,
norm_layer=FrozenBatchNorm2d if frozen_bn_opt else misc_nn_ops.FrozenBatchNorm2d,
jit=jit)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size,
head_fusion=head_fusion)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnext50_32x4d_fpn model")
return model
def retinanet_resnet101_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a ResNet-101-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnet101_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnet101', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnet101_fpn model")
return model
def retinanet_resnext101_32x8d_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a resnext101_32x8d-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnext101_32x8d_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnext101_32x8d', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnext101_32x8d_fpn model")
return model
def retinanet_from_backbone(backbone,
num_classes=91, data_layout='channels_first', image_size=None,
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None, jit=False, head_fusion=False, frozen_bn_opt=False):
if image_size is None:
image_size = [800, 800]
if backbone == "resnet50":
return retinanet_resnet50_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
elif backbone == "resnext50_32x4d":
return retinanet_resnext50_32x4d_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers, jit=jit,
head_fusion=head_fusion, frozen_bn_opt=frozen_bn_opt)
elif backbone == "resnet101":
return retinanet_resnet101_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
elif backbone == "resnext101_32x8d":
return retinanet_resnext101_32x8d_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
else:
raise ValueError(f"Unknown backbone {backbone}")
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torchvision
import torch.nn.functional as F
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops
from torchvision.ops import roi_align
from typing import Optional, List, Dict, Tuple
from model.utils import BoxCoder, Matcher
def expand_boxes(boxes, scale):
# type: (Tensor, float) -> Tensor
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = torch.zeros_like(boxes)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def expand_masks(mask, padding):
# type: (Tensor, int) -> Tuple[Tensor, float]
M = mask.shape[-1]
scale = float(M + 2 * padding) / M
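    # e.g. with M = 28 and padding = 1 the scale is 30 / 28, and the boxes are later
    # expanded by the same relative factor so the padded mask still lines up with its box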
padded_mask = F.pad(mask, (padding,) * 4)
return padded_mask, scale
def paste_mask_in_image(mask, box, im_h, im_w):
# type: (Tensor, Tensor, int, int) -> Tensor
TO_REMOVE = 1
w = int(box[2] - box[0] + TO_REMOVE)
h = int(box[3] - box[1] + TO_REMOVE)
w = max(w, 1)
h = max(h, 1)
# Set shape to [batchxCxHxW]
mask = mask.expand((1, 1, -1, -1))
# Resize mask
mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
mask = mask[0][0]
im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
x_0 = max(box[0], 0)
x_1 = min(box[2] + 1, im_w)
y_0 = max(box[1], 0)
y_1 = min(box[3] + 1, im_h)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0])
]
return im_mask
def paste_masks_in_image(masks, boxes, img_shape, padding=1):
# type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
masks, scale = expand_masks(masks, padding=padding)
boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
im_h, im_w = img_shape
res = [
paste_mask_in_image(m[0], b, im_h, im_w)
for m, b in zip(masks, boxes)
]
if len(res) > 0:
ret = torch.stack(res, dim=0)[:, None]
else:
ret = masks.new_empty((0, 1, im_h, im_w))
return ret
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torchvision
from torch import nn, Tensor
from typing import List, Tuple, Dict, Optional
from model.image_list import ImageList
from model.roi_heads import paste_masks_in_image
@torch.jit.unused
def _get_shape_onnx(image: Tensor) -> Tensor:
from torch.onnx import operators
return operators.shape_as_tensor(image)[-2:]
@torch.jit.unused
def _fake_cast_onnx(v: Tensor) -> float:
# ONNX requires a tensor but here we fake its type for JIT.
return v
def _resize_image_and_masks(image: Tensor,
target: Optional[Dict[str, Tensor]] = None,
image_size: Optional[Tuple[int, int]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if torchvision._is_tracing():
im_shape = _get_shape_onnx(image)
else:
im_shape = torch.tensor(image.shape[-2:])
image = torch.nn.functional.interpolate(image[None], size=image_size, scale_factor=None, mode='bilinear',
recompute_scale_factor=None, align_corners=False)[0]
if target is None:
return image, target
if "masks" in target:
mask = target["masks"]
mask = torch.nn.functional.interpolate(mask[:, None].float(), size=image_size, scale_factor=None,
recompute_scale_factor=None)[:, 0].byte()
target["masks"] = mask
return image, target
class GeneralizedRCNNTransform(nn.Module):
"""
Performs input / target transformation before feeding the data to a GeneralizedRCNN
model.
    The transformations it performs are:
- input normalization (mean subtraction and std division)
- input / target resizing to match image_size
    It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
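    Example (illustrative; the normalization constants are the ImageNet values used elsewhere in this file)::
        >>> transform = GeneralizedRCNNTransform(image_size=(800, 800),
        >>>                                      image_mean=[0.485, 0.456, 0.406],
        >>>                                      image_std=[0.229, 0.224, 0.225])
        >>> images = [torch.rand(3, 600, 400), torch.rand(3, 500, 700)]
        >>> image_list, _ = transform(images)
        >>> image_list.tensors.shape  # -> torch.Size([2, 3, 800, 800])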
"""
def __init__(self, image_size: Optional[Tuple[int, int]],
image_mean: List[float], image_std: List[float],):
super(GeneralizedRCNNTransform, self).__init__()
self.image_size = image_size
self.image_mean = image_mean
self.image_std = image_std
def forward(self,
images: List[Tensor],
targets: Optional[List[Dict[str, Tensor]]] = None
) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]:
images = [img for img in images]
if targets is not None:
# make a copy of targets to avoid modifying it in-place
# once torchscript supports dict comprehension
# this can be simplified as follows
# targets = [{k: v for k,v in t.items()} for t in targets]
targets_copy: List[Dict[str, Tensor]] = []
for t in targets:
data: Dict[str, Tensor] = {}
for k, v in t.items():
data[k] = v
targets_copy.append(data)
targets = targets_copy
for i in range(len(images)):
image = images[i]
target_index = targets[i] if targets is not None else None
if image.dim() != 3:
raise ValueError("images is expected to be a list of 3d tensors "
"of shape [C, H, W], got {}".format(image.shape))
image = self.normalize(image)
image, target_index = self.resize(image, target_index)
images[i] = image
if targets is not None and target_index is not None:
targets[i] = target_index
image_sizes = [img.shape[-2:] for img in images]
images = torch.stack(images)
image_sizes_list: List[Tuple[int, int]] = []
for image_size in image_sizes:
assert len(image_size) == 2
image_sizes_list.append((image_size[0], image_size[1]))
image_list = ImageList(images, image_sizes_list)
return image_list, targets
def normalize(self, image: Tensor) -> Tensor:
if not image.is_floating_point():
raise TypeError(
f"Expected input images to be of floating type (in range [0, 1]), "
f"but found type {image.dtype} instead"
)
dtype, device = image.dtype, image.device
mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
return (image - mean[:, None, None]) / std[:, None, None]
def torch_choice(self, k: List[int]) -> int:
"""
Implements `random.choice` via torch ops so it can be compiled with
TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
is fixed.
"""
index = int(torch.empty(1).uniform_(0., float(len(k))).item())
return k[index]
def resize(self,
image: Tensor,
target: Optional[Dict[str, Tensor]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
h, w = image.shape[-2:]
image, target = _resize_image_and_masks(image, target, self.image_size)
if target is None:
return image, target
bbox = target["boxes"]
bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
target["boxes"] = bbox
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:])
target["keypoints"] = keypoints
return image, target
def postprocess(self,
result: List[Dict[str, Tensor]],
image_shapes: List[Tuple[int, int]],
original_image_sizes: List[Tuple[int, int]]
) -> List[Dict[str, Tensor]]:
if self.training:
return result
for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
boxes = pred["boxes"]
boxes = resize_boxes(boxes, im_s, o_im_s)
result[i]["boxes"] = boxes
if "masks" in pred:
masks = pred["masks"]
masks = paste_masks_in_image(masks, boxes, o_im_s)
result[i]["masks"] = masks
if "keypoints" in pred:
keypoints = pred["keypoints"]
keypoints = resize_keypoints(keypoints, im_s, o_im_s)
result[i]["keypoints"] = keypoints
return result
def __repr__(self) -> str:
format_string = self.__class__.__name__ + '('
_indent = '\n '
format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std)
format_string += "{0}Resize(height={1}, width={2}, mode='bilinear')".format(_indent, self.image_size[0],
self.image_size[1])
format_string += '\n)'
return format_string
def resize_keypoints(keypoints: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
ratios = [
torch.tensor(s, dtype=torch.float32, device=keypoints.device) /
torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device)
for s, s_orig in zip(new_size, original_size)
]
ratio_h, ratio_w = ratios
resized_data = keypoints.clone()
if torch._C._get_tracing_state():
resized_data_0 = resized_data[:, :, 0] * ratio_w
resized_data_1 = resized_data[:, :, 1] * ratio_h
resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2)
else:
resized_data[..., 0] *= ratio_w
resized_data[..., 1] *= ratio_h
return resized_data
def resize_boxes(boxes: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
ratios = [
torch.tensor(s, dtype=torch.float32, device=boxes.device) /
torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
for s, s_orig in zip(new_size, original_size)
]
ratio_height, ratio_width = ratios
xmin, ymin, xmax, ymax = boxes.unbind(1)
xmin = xmin * ratio_width
xmax = xmax * ratio_width
ymin = ymin * ratio_height
ymax = ymax * ratio_height
return torch.stack((xmin, ymin, xmax, ymax), dim=1)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from collections import OrderedDict
from torch import Tensor, nn
from typing import List, Tuple, Dict
from .frozen_bn import FrozenBatchNorm2d
class IntermediateLayerGetter(nn.ModuleDict):
"""
Module wrapper that returns intermediate layers from a model
It has a strong assumption that the modules have been registered
into the model in the same order as they are used.
This means that one should **not** reuse the same nn.Module
twice in the forward if you want this to work.
Additionally, it is only able to query submodules that are directly
assigned to the model. So if `model` is passed, `model.feature1` can
be returned, but not `model.feature1.layer2`.
Args:
model (nn.Module): model on which we will extract the features
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
the key of the dict, and the value of the dict is the name
of the returned activation (which the user can specify).
Examples::
>>> m = torchvision.models.resnet18(pretrained=True)
        >>> # extract layer1 and layer3, giving as names `feat1` and `feat2`
>>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
>>> {'layer1': 'feat1', 'layer3': 'feat2'})
>>> out = new_m(torch.rand(1, 3, 224, 224))
>>> print([(k, v.shape) for k, v in out.items()])
>>> [('feat1', torch.Size([1, 64, 56, 56])),
>>> ('feat2', torch.Size([1, 256, 14, 14]))]
"""
_version = 2
__annotations__ = {
"return_layers": Dict[str, str],
}
def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
if not set(return_layers).issubset([name for name, _ in model.named_children()]):
raise ValueError("return_layers are not present in model")
orig_return_layers = return_layers
return_layers = {str(k): str(v) for k, v in return_layers.items()}
layers = OrderedDict()
for name, module in model.named_children():
layers[name] = module
if name in return_layers:
del return_layers[name]
if not return_layers:
break
super(IntermediateLayerGetter, self).__init__(layers)
self.return_layers = orig_return_layers
def forward(self, x):
out = OrderedDict()
for name, module in self.items():
x = module(x)
if name in self.return_layers:
out_name = self.return_layers[name]
out[out_name] = x
return out
@torch.jit._script_if_tracing
def encode_boxes(reference_boxes, proposals, weights):
# type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
weights (Tensor[4]): the weights for ``(x, y, w, h)``
"""
# perform some unpacking to make it JIT-fusion friendly
wx = weights[0]
wy = weights[1]
ww = weights[2]
wh = weights[3]
proposals_x1 = proposals[:, 0].unsqueeze(1)
proposals_y1 = proposals[:, 1].unsqueeze(1)
proposals_x2 = proposals[:, 2].unsqueeze(1)
proposals_y2 = proposals[:, 3].unsqueeze(1)
reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
# implementation starts here
ex_widths = proposals_x2 - proposals_x1
ex_heights = proposals_y2 - proposals_y1
ex_ctr_x = proposals_x1 + 0.5 * ex_widths
ex_ctr_y = proposals_y1 + 0.5 * ex_heights
gt_widths = reference_boxes_x2 - reference_boxes_x1
gt_heights = reference_boxes_y2 - reference_boxes_y1
gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * torch.log(gt_widths / ex_widths)
targets_dh = wh * torch.log(gt_heights / ex_heights)
targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
return targets
# Similar to encode_boxes, but accepts tensors with batch dimension
@torch.jit._script_if_tracing
def encode_boxes_batch(reference_boxes, proposals, weights):
# type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
weights (Tensor[4]): the weights for ``(x, y, w, h)``
"""
# perform some unpacking to make it JIT-fusion friendly
wx = weights[0]
wy = weights[1]
ww = weights[2]
wh = weights[3]
proposals_x1 = proposals[:, :, 0]
proposals_y1 = proposals[:, :, 1]
proposals_x2 = proposals[:, :, 2]
proposals_y2 = proposals[:, :, 3]
reference_boxes_x1 = reference_boxes[:, :, 0]
reference_boxes_y1 = reference_boxes[:, :, 1]
reference_boxes_x2 = reference_boxes[:, :, 2]
reference_boxes_y2 = reference_boxes[:, :, 3]
# implementation starts here
ex_widths = proposals_x2 - proposals_x1
ex_heights = proposals_y2 - proposals_y1
ex_ctr_x = proposals_x1 + 0.5 * ex_widths
ex_ctr_y = proposals_y1 + 0.5 * ex_heights
gt_widths = reference_boxes_x2 - reference_boxes_x1
gt_heights = reference_boxes_y2 - reference_boxes_y1
gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * torch.log(gt_widths / ex_widths)
targets_dh = wh * torch.log(gt_heights / ex_heights)
targets = torch.cat((targets_dx[:, :, None], targets_dy[:, :, None], targets_dw[:, :, None], targets_dh[:, :, None]), dim=2)
return targets
class BoxCoder(object):
"""
This class encodes and decodes a set of bounding boxes into
the representation used for training the regressors.
"""
def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
# type: (Tuple[float, float, float, float], float) -> None
"""
Args:
weights (4-element tuple)
bbox_xform_clip (float)
"""
self.weights = weights
self.weights_as_tensor = None
self.bbox_xform_clip = bbox_xform_clip
def encode(self, reference_boxes, proposals):
# type: (List[Tensor], List[Tensor]) -> List[Tensor]
boxes_per_image = [len(b) for b in reference_boxes]
reference_boxes = torch.cat(reference_boxes, dim=0)
proposals = torch.cat(proposals, dim=0)
targets = self.encode_single(reference_boxes, proposals)
return targets.split(boxes_per_image, 0)
def encode_single(self, reference_boxes, proposals):
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
dtype = reference_boxes.dtype
device = reference_boxes.device
weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
targets = encode_boxes(reference_boxes, proposals, weights)
return targets
# Similar to encode_single, just a wrapper for a batched input
def encode_batch(self, reference_boxes, proposals):
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
dtype = reference_boxes.dtype
device = reference_boxes.device
if self.weights_as_tensor is None:
self.weights_as_tensor = torch.as_tensor(self.weights, dtype=dtype, device=device)
weights = self.weights_as_tensor
targets = encode_boxes_batch(reference_boxes, proposals, weights)
return targets
def decode(self, rel_codes, boxes):
# type: (Tensor, List[Tensor]) -> Tensor
assert isinstance(boxes, (list, tuple))
assert isinstance(rel_codes, torch.Tensor)
boxes_per_image = [b.size(0) for b in boxes]
concat_boxes = torch.cat(boxes, dim=0)
box_sum = 0
for val in boxes_per_image:
box_sum += val
if box_sum > 0:
rel_codes = rel_codes.reshape(box_sum, -1)
pred_boxes = self.decode_single(
rel_codes, concat_boxes
)
if box_sum > 0:
pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
return pred_boxes
def decode_single(self, rel_codes, boxes):
"""
From a set of original boxes and encoded relative box offsets,
get the decoded boxes.
Args:
rel_codes (Tensor): encoded boxes
boxes (Tensor): reference boxes.
"""
boxes = boxes.to(rel_codes.dtype)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = self.weights
dx = rel_codes[:, 0::4] / wx
dy = rel_codes[:, 1::4] / wy
dw = rel_codes[:, 2::4] / ww
dh = rel_codes[:, 3::4] / wh
# Prevent sending too large values into torch.exp()
dw = torch.clamp(dw, max=self.bbox_xform_clip)
dh = torch.clamp(dh, max=self.bbox_xform_clip)
pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
pred_w = torch.exp(dw) * widths[:, None]
pred_h = torch.exp(dh) * heights[:, None]
# Distance from center to box's corner.
c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
pred_boxes1 = pred_ctr_x - c_to_c_w
pred_boxes2 = pred_ctr_y - c_to_c_h
pred_boxes3 = pred_ctr_x + c_to_c_w
pred_boxes4 = pred_ctr_y + c_to_c_h
pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
return pred_boxes
class Matcher(object):
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth
element. Each predicted element will have exactly zero or one matches; each
ground-truth element may be assigned to zero or more predicted elements.
    Matching is based on the MxN match_quality_matrix, which characterizes how well
    each (ground-truth, prediction) pair matches. For example, if the elements are
boxes, the matrix may contain box IoU overlap values.
The matcher returns a tensor of size N containing the index of the ground-truth
element m that matches to prediction n. If there is no match, a negative value
is returned.
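    Example (illustrative; the thresholds match the RetinaNet defaults used above)::
        >>> matcher = Matcher(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=True)
        >>> iou = torch.tensor([[0.90, 0.45, 0.10],
        >>>                     [0.20, 0.60, 0.05]])  # 2 ground-truth boxes x 3 predictions
        >>> matcher(iou)  # -> tensor([0, 1, -1])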
"""
BELOW_LOW_THRESHOLD = -1
BETWEEN_THRESHOLDS = -2
__annotations__ = {
'BELOW_LOW_THRESHOLD': int,
'BETWEEN_THRESHOLDS': int,
}
def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
# type: (float, float, bool) -> None
"""
Args:
high_threshold (float): quality values greater than or equal to
this value are candidate matches.
low_threshold (float): a lower quality threshold used to stratify
matches into three levels:
1) matches >= high_threshold
2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
allow_low_quality_matches (bool): if True, produce additional matches
for predictions that have only low-quality match candidates. See
set_low_quality_matches_ for more details.
"""
self.BELOW_LOW_THRESHOLD = -1
self.BETWEEN_THRESHOLDS = -2
assert low_threshold <= high_threshold
self.high_threshold = high_threshold
self.low_threshold = low_threshold
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
"""
Args:
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
pairwise quality between M ground-truth elements and N predicted elements.
Returns:
matches (Tensor[int64]): an N tensor where matches[i] is the index of the
matched gt in [0, M - 1], or a negative value indicating that prediction i
could not be matched.
"""
if match_quality_matrix.numel() == 0:
# empty targets or proposals not supported during training
if match_quality_matrix.shape[0] == 0:
raise ValueError(
"No ground-truth boxes available for one of the images "
"during training")
else:
raise ValueError(
"No proposal boxes available for one of the images "
"during training")
# match_quality_matrix is M (gt) x N (predicted)
# Max over gt elements (dim 0) to find best gt candidate for each prediction
matched_vals, matches = match_quality_matrix.max(dim=0)
if self.allow_low_quality_matches:
all_matches = matches.clone()
else:
all_matches = None
# Assign candidate matches with low quality to negative (unassigned) values
below_low_threshold = matched_vals < self.low_threshold
between_thresholds = (matched_vals >= self.low_threshold) & (
matched_vals < self.high_threshold
)
matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD
matches[between_thresholds] = self.BETWEEN_THRESHOLDS
if self.allow_low_quality_matches:
assert all_matches is not None
self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
return matches
def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
"""
Produce additional matches for predictions that have only low-quality matches.
Specifically, for each ground-truth find the set of predictions that have
maximum overlap with it (including ties); for each prediction in that set, if
it is unmatched, then match it to the ground-truth with which it has the highest
quality value.
"""
# For each gt, find the prediction with which it has highest quality
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.where(
match_quality_matrix == highest_quality_foreach_gt[:, None]
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],
# [ 1, 32055],
# [ 1, 32070],
# [ 2, 39190],
# [ 2, 40255],
# [ 3, 40390],
# [ 3, 41455],
# [ 4, 45470],
# [ 5, 45325],
# [ 5, 46390]])
# Each row is a (gt index, prediction index)
# Note how gt items 1, 2, 3, and 5 each have two ties
pred_inds_to_update = gt_pred_pairs_of_highest_quality[1]
matches[pred_inds_to_update] = all_matches[pred_inds_to_update]
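# Illustrative example for Matcher (a sketch, not part of the original file):
# with a 2x3 IoU matrix (2 ground-truth boxes, 3 predictions) and thresholds
# (0.5, 0.4), prediction 2 falls between the thresholds and is marked -2.
# >>> matcher = Matcher(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=False)
# >>> iou = torch.tensor([[0.9, 0.3, 0.45],
# ...                     [0.1, 0.6, 0.20]])
# >>> matcher(iou)
# tensor([ 0,  1, -2])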
# Similar to Matcher above, but operates on batched input
# See the Matcher class above for additional comments
class MatcherBatch(object):
BELOW_LOW_THRESHOLD = -1
BETWEEN_THRESHOLDS = -2
__annotations__ = {
'BELOW_LOW_THRESHOLD': int,
'BETWEEN_THRESHOLDS': int,
}
def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
# type: (float, float, bool) -> None
self.BELOW_LOW_THRESHOLD = -1
self.BETWEEN_THRESHOLDS = -2
assert low_threshold <= high_threshold
self.high_threshold = high_threshold
self.low_threshold = low_threshold
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
# TODO: move to preprocessing
if match_quality_matrix.numel() == 0:
# empty targets or proposals not supported during training
if match_quality_matrix.shape[0] == 0:
raise ValueError(
"No ground-truth boxes available for one of the images "
"during training")
else:
raise ValueError(
"No proposal boxes available for one of the images "
"during training")
matched_vals, matches = match_quality_matrix.max(dim=1)
all_matches = matches.clone() if self.allow_low_quality_matches else None
below_low_threshold = matched_vals < self.low_threshold
between_thresholds = (matched_vals >= self.low_threshold) & (matched_vals < self.high_threshold)
matches = torch.where(below_low_threshold, self.BELOW_LOW_THRESHOLD, matches)
matches = torch.where(between_thresholds, self.BETWEEN_THRESHOLDS, matches)
if self.allow_low_quality_matches:
assert all_matches is not None
matches = self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
return matches
def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=2)
gt_pred_pairs_of_highest_quality = \
torch.where((match_quality_matrix == highest_quality_foreach_gt[:, :, None]) &
(match_quality_matrix != 0), 1, 0)
gt_pred_pairs_of_highest_quality = gt_pred_pairs_of_highest_quality.sum(dim=1)
matches = torch.where(gt_pred_pairs_of_highest_quality >= 1, all_matches, matches)
return matches
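# Illustrative example for MatcherBatch (a sketch, not part of the original file;
# the batched quality matrix shape [batch, M, N] is inferred from the dim=1/dim=2
# reductions above):
# >>> matcher = MatcherBatch(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=False)
# >>> iou = torch.tensor([[[0.9, 0.3, 0.45],
# ...                      [0.1, 0.6, 0.20]]])
# >>> matcher(iou)
# tensor([[ 0,  1, -2]])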
class SSDMatcher(Matcher):
def __init__(self, threshold):
super().__init__(threshold, threshold, allow_low_quality_matches=False)
def __call__(self, match_quality_matrix):
matches = super().__call__(match_quality_matrix)
# For each gt, find the prediction with which it has the highest quality
_, highest_quality_pred_foreach_gt = match_quality_matrix.max(dim=1)
matches[highest_quality_pred_foreach_gt] = torch.arange(highest_quality_pred_foreach_gt.size(0),
dtype=torch.int64,
device=highest_quality_pred_foreach_gt.device)
return matches
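# Illustrative example for SSDMatcher (a sketch, not part of the original file):
# each ground truth is force-matched to its best prediction even when the IoU is
# below the threshold (here gt 1 claims prediction 1 despite an IoU of only 0.4).
# >>> matcher = SSDMatcher(threshold=0.5)
# >>> iou = torch.tensor([[0.9, 0.3],
# ...                     [0.1, 0.4]])
# >>> matcher(iou)
# tensor([0, 1])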
def overwrite_eps(model, eps):
"""
This method overwrites the default eps values of all the
FrozenBatchNorm2d layers of the model with the provided value.
This is necessary to address the BC-breaking change introduced
by the bug-fix at pytorch/vision#2933. The overwrite is applied
only when the pretrained weights are loaded to maintain compatibility
with previous versions.
Args:
model (nn.Module): The model on which we perform the overwrite.
eps (float): The new value of eps.
"""
for module in model.modules():
if isinstance(module, FrozenBatchNorm2d):
module.eps = eps
def retrieve_out_channels(model, size):
"""
This method retrieves the number of output channels of a specific model.
Args:
model (nn.Module): The model for which we estimate the out_channels.
It should return a single Tensor or an OrderedDict[Tensor].
size (Tuple[int, int]): The size (wxh) of the input.
Returns:
out_channels (List[int]): A list of the output channels of the model.
"""
in_training = model.training
model.eval()
with torch.no_grad():
# Use dummy data to retrieve the feature map sizes to avoid hard-coding their values
device = next(model.parameters()).device
tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device)
features = model(tmp_img)
if isinstance(features, torch.Tensor):
features = OrderedDict([('0', features)])
out_channels = [x.size(1) for x in features.values()]
if in_training:
model.train()
return out_channels
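# Illustrative example for retrieve_out_channels (a sketch, not part of the original
# file; assumes a ResNet50+FPN backbone built with this repo's resnet_fpn_backbone
# and its default 256 FPN channels):
# >>> backbone = resnet_fpn_backbone('resnet50', pretrained=False)
# >>> retrieve_out_channels(backbone, size=(320, 320))
# [256, 256, 256, 256, 256]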
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import utils
from engine import preprocessing, init_scratchpad, loss_preprocessing, compute_loss, compute_matched_idxs
import copy
def whole_model_capture(model, optimizer, scaler, dataset, args):
print('CUDA graph capture')
# save original params for later
model_bak = copy.deepcopy(model.state_dict())
optimizer_bak = copy.deepcopy(optimizer.state_dict())
model.train()
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
# Convert epochs to iterations
# we want to control warmup at the epoch level, but update lr every iteration
start_iter = 0
dataset_len = len(dataset) if dataset is not None else int(args.train_sz / args.batch_size / utils.get_world_size())
warmup_iters = args.warmup_epochs * dataset_len
lr_scheduler = utils.warmup_lr_scheduler(optimizer, start_iter, warmup_iters, args.warmup_factor)
if args.cuda_graphs_syn:
assert (dataset is None)
images, targets = [], {'boxes': [], 'labels': []}
for b in range(args.batch_size):
# These are just arbitrary sizes for model capture
images.append(torch.randint(low=0, high=256, size=[3, 1000, 1000], device=device).float() / 255)
targets['boxes'].append(torch.tensor([[10, 20, 30, 40]], device=device))
targets['labels'].append(torch.tensor([1], device=device))
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
else:
images, targets = [], []
# taking the first batch
for images_, targets_ in dataset:
images = images_
targets = targets_
break
# if not DALI, then we should preprocess the data
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# --- preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# DALI can compute matched_idxs and put them in targets; if it hasn't done so, compute them here
if 'matched_idxs' not in targets:
with torch.cuda.amp.autocast(enabled=args.amp):
targets['matched_idxs'] = compute_matched_idxs(targets['boxes'], model_ptr)
with torch.cuda.amp.autocast(enabled=args.amp):
init_scratchpad(images, targets, args.batch_size, args.num_classes, args.amp,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad,
args.cuda_graphs)
if args.not_graphed_prologues:
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
static_matched_idxs = torch.zeros_like(targets['matched_idxs'])
static_matched_idxs.copy_(targets['matched_idxs'])
# --- warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for j in range(11):
if args.apex_adam:
# set_to_none is True by default
optimizer.zero_grad()
else:
optimizer.zero_grad(set_to_none=True)
# lr_scheduler.step()
with torch.cuda.amp.autocast(enabled=args.amp):
if not args.not_graphed_prologues:
# preprocess everything that does not require model forward and backward
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
cls_loss, reg_loss = compute_loss(model_ptr, model_output[5], model_output[6], valid_idxs,
gt_classes_target, num_foreground, target_regression,
foreground_idxs_mask, args.apex_focal_loss, args.reg_head_pad)
losses = cls_loss + reg_loss
assert(not torch.isnan(losses))
# backward
scaler.scale(losses).backward()
# optimizer
scaler.step(optimizer)
scaler.update()
torch.cuda.current_stream().wait_stream(s)
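# Note: the warmup iterations above are run on a side stream so that memory-pool
# allocations, cuDNN algorithm selection, and the AMP grad-scaler state settle
# before capture, following the pattern recommended in the PyTorch CUDA Graphs docs.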
# --- capture
g = torch.cuda.CUDAGraph()
if args.apex_adam:
# set_to_none is True by default
optimizer.zero_grad()
else:
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
# # LR was already copied during warmup
# if args.warmup_epochs > 0:
# lr_scheduler.step()
with torch.cuda.amp.autocast(enabled=args.amp):
if not args.not_graphed_prologues:
# loss_preprocessing is now part of the graph
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
static_model_output = model(images)
# loss
static_cls_loss, static_reg_loss = compute_loss(model_ptr, static_model_output[5], static_model_output[6],
valid_idxs, gt_classes_target, num_foreground,
target_regression, foreground_idxs_mask,
args.apex_focal_loss, args.reg_head_pad)
static_loss = static_cls_loss + static_reg_loss
# backward
scaler.scale(static_loss).backward()
# scaler.step(optimizer)
# scaler.update()
scaler.step(optimizer)
# set scaler and model back to their default values
scaler.update(65536.0)
model.load_state_dict(model_bak)
optimizer.load_state_dict(optimizer_bak)
if args.not_graphed_prologues:
static_prologues_out = [gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask]
else:
static_prologues_out = None
return g, images, static_loss, static_prologues_out
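# Illustrative replay sketch for whole_model_capture (not part of the original file;
# `new_images` is a hypothetical tensor with the same shape/dtype/device as the
# returned static `images` buffer, which is assumed to be a single batched tensor):
# >>> images.copy_(new_images)   # refill the captured input buffer in place
# >>> g.replay()                 # re-run the captured forward/loss/backward work
# >>> print(static_loss.item())  # the captured loss tensor now holds the new value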
def whole_model_capture_eval(model, dataset, args):
# save original params for later
model_bak = copy.deepcopy(model.state_dict())
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
if args.cuda_graphs_syn:
assert (dataset is None)
images, targets = [], {'boxes': [], 'labels': []}
for b in range(args.eval_batch_size):
# These are just arbitrary sizes for model capture
images.append(torch.rand([3, 1000, 1000], device=device))
targets['boxes'].append(torch.tensor([[10, 20, 30, 40]], device=device))
targets['labels'].append(torch.tensor([1], device=device))
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
else:
images, targets = [], []
# taking the first batch
for images_, targets_ in dataset:
images = images_
targets = targets_
break
# if not DALI, then we should preprocess the data
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# --- preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# --- warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for j in range(11):
with torch.cuda.amp.autocast(enabled=args.amp):
# forward
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
torch.cuda.current_stream().wait_stream(s)
# --- capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
with torch.cuda.amp.autocast(enabled=args.amp):
# forward
static_model_output = model(images)
return g, images, static_model_output
def model_eval_warmup(model, batch_size, iters, args):
model.eval()
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
for i in range(iters):
with torch.cuda.amp.autocast(enabled=args.amp):
x = torch.rand([batch_size, 3, args.image_size[0], args.image_size[1]], device=device)
model(x)
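# Illustrative call for model_eval_warmup (a sketch, not part of the original file;
# the iteration count is arbitrary):
# >>> model_eval_warmup(model, batch_size=args.eval_batch_size, iters=8, args=args)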
#!/bin/bash
## DL params
export HSA_FORCE_FINE_GRAIN_PCIE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'
## System config params
export DGXNGPU=8
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
)
# run training
torchrun --nproc_per_node="${DGXNGPU}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} 2>&1 | tee ssd_bs16_epoch6.log
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import transforms as T
class DetectionPresetTrain:
def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)):
if data_augmentation == 'hflip':
self.transforms = T.Compose([
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
elif data_augmentation == 'ssd':
self.transforms = T.Compose([
T.RandomPhotometricDistort(),
T.RandomZoomOut(fill=list(mean)),
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
elif data_augmentation == 'ssdlite':
self.transforms = T.Compose([
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
else:
raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"')
def __call__(self, img, target):
return self.transforms(img, target)
class DetectionPresetEval:
def __init__(self):
self.transforms = T.ToTensor()
def __call__(self, img, target):
return self.transforms(img, target)
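# Illustrative usage for the presets above (a sketch, not part of the original file;
# `img` is assumed to be a PIL image and `target` a dict with 'boxes'/'labels' tensors):
# >>> train_tf = DetectionPresetTrain(data_augmentation='ssd')
# >>> img, target = train_tf(img, target)
# >>> eval_tf = DetectionPresetEval()
# >>> img, target = eval_tf(img, target)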
Cython>=0.29.32
scikit-image>=0.19.3
ujson>=5.5.0
pybind11>=2.10.0
git+https://github.com/NVIDIA/mlperf-common.git
git+https://github.com/mlcommons/logging.git@2.1.0-rc1
pyparsing>=3.0.9
#!/bin/bash
#SBATCH --job-name single_stage_detector
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euxo pipefail
# Vars without defaults
: "${DGXSYSTEM:?DGXSYSTEM not set}"
: "${CONT:?CONT not set}"
# Vars with defaults
: "${MLPERF_RULESET:=2.1.0}"
: "${NEXP:=5}"
: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
: "${CLEAR_CACHES:=1}"
: "${WORK_DIR:=/workspace/ssd}"
: "${CONT_NAME:=single_stage_detector}"
# ci automagically sets this correctly on Selene
: "${LOGDIR:=./results}"
# Scaleout bridge
: "${NVTX_FLAG:=0}"
: "${TIME_TAGS:=0}"
: "${NCCL_TEST:=0}"
: "${SYNTH_DATA:=0}"
: "${EPOCH_PROF:=0}"
: "${DISABLE_CG:=0}"
# API Logging defaults
: "${API_LOGGING:=0}"
: "${API_LOG_DIR:=./api_logs}" # apiLog.sh output dir
LOGBASE="${DATESTAMP}"
SPREFIX="single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}"
if [ ${TIME_TAGS} -gt 0 ]; then
LOGBASE="${SPREFIX}_mllog"
fi
if [ ${NVTX_FLAG} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_nsys"
else
LOGBASE="${SPREFIX}_nsys"
fi
if [[ ! -d "${NVMLPERF_NSIGHT_LOCATION}" ]]; then
echo "$NVMLPERF_NSIGHT_LOCATION doesn't exist on this system!" 1>&2
exit 1
fi
fi
if [ ${SYNTH_DATA} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_synth"
else
LOGBASE="${SPREFIX}_synth"
fi
fi
if [ ${EPOCH_PROF} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_epoch"
else
LOGBASE="${SPREFIX}_epoch"
fi
fi
if [ ${DISABLE_CG} -gt 0 ]; then
EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--cuda-graphs//')
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_nocg"
else
LOGBASE="${SPREFIX}_nocg"
fi
fi
# do we need to fetch the data from lustre to /raid/scratch?
if [[ "${LOCALDISK_FROM_SQUASHFS:-}" ]]; then
# LOCALDISK_FROM_SQUASHFS should be the path/name of a squashfs file on /lustre
echo "fetching ${LOCALDISK_FROM_SQUASHFS}"
dd bs=4M if="${LOCALDISK_FROM_SQUASHFS}" of=/raid/scratch/tmp.sqsh oflag=direct
echo "unsquashing /raid/scratch/tmp.sqsh"
time unsquashfs -no-progress -dest /raid/scratch/local-root /raid/scratch/tmp.sqsh
fi
readonly LOG_FILE_BASE="${LOGDIR}/${LOGBASE}"
CONT_MOUNTS="${DATADIR}:/datasets/open-images-v6,${LOGDIR}:/results,${BACKBONE_DIR}:/root/.cache/torch"
if [[ "${NVTX_FLAG}" -gt 0 ]]; then
CONT_MOUNTS="${CONT_MOUNTS},${NVMLPERF_NSIGHT_LOCATION}:/nsight"
fi
# API Logging
if [ "${API_LOGGING}" -eq 1 ]; then
CONT_MOUNTS="${CONT_MOUNTS},${API_LOG_DIR}:/logs"
fi
# Setup directories
( umask 0002; mkdir -p "${LOGDIR}" )
srun --ntasks="${SLURM_JOB_NUM_NODES}" mkdir -p "${LOGDIR}"
# Setup container
echo MELLANOX_VISIBLE_DEVICES="${MELLANOX_VISIBLE_DEVICES:-}"
srun \
--ntasks="${SLURM_JOB_NUM_NODES}" \
--container-image="${CONT}" \
--container-name="${CONT_NAME}" \
true
srun -N1 -n1 --container-name="${CONT_NAME}" ibv_devinfo --list
srun -N1 -n1 --container-name="${CONT_NAME}" nvidia-smi topo -m
echo "NCCL_TEST = ${NCCL_TEST}"
if [[ ${NCCL_TEST} -eq 1 ]]; then
(srun --mpi=pmix --ntasks="$(( SLURM_JOB_NUM_NODES * DGXNGPU ))" --ntasks-per-node="${DGXNGPU}" \
--container-name="${CONT_NAME}" all_reduce_perf_mpi -b 33260119 -e 33260119 -d half -G 1 ) |& tee "${LOGDIR}/${SPREFIX}_nccl.log"
fi
# Run experiments
for _experiment_index in $(seq -w 1 "${NEXP}"); do
(
echo "Beginning trial ${_experiment_index} of ${NEXP}"
echo ":::DLPAL ${CONT} ${SLURM_JOB_ID} ${SLURM_JOB_NUM_NODES} ${SLURM_JOB_NODELIST}"
# Print system info (currently a placeholder no-op)
srun -N1 -n1 --container-name="${CONT_NAME}" python -c ""
# Clear caches
if [ "${CLEAR_CACHES}" -eq 1 ]; then
srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${CONT_NAME}" python -c "
from mlperf_logger import mllogger
mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)"
fi
# Run experiment
srun \
--ntasks="$(( SLURM_JOB_NUM_NODES * DGXNGPU ))" \
--ntasks-per-node="${DGXNGPU}" \
--container-name="${CONT_NAME}" \
--container-mounts="${CONT_MOUNTS}" \
--container-workdir=${WORK_DIR} \
./run_and_time.sh
) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log"
# compliance checker
srun --ntasks=1 --nodes=1 --container-name="${CONT_NAME}" \
--container-mounts="$(realpath ${LOGDIR}):/results" \
--container-workdir="/results" \
python3 -m mlperf_logging.compliance_checker --usage training \
--ruleset "${MLPERF_RULESET}" \
--log_output "/results/compliance_${DATESTAMP}.out" \
"/results/${LOGBASE}_${_experiment_index}.log" \
|| true
done
#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# runs benchmark and reports time to convergence
# to use the script:
# run_and_time.sh
set +x
set -e
# Only rank 0 prints (other ranks disable xtrace)
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
# Set variables
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}
# run benchmark
echo "running benchmark"
if [ ${NVTX_FLAG} -gt 0 ]; then
# FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
else
NSYSCMD=""
fi
if [ ${SYNTH_DATA} -gt 0 ]; then
EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
fi
declare -a CMD
if [ -n "${SLURM_LOCALID-}" ]; then
# Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
else
CMD=( ${NSYSCMD} 'python' '-u' )
fi
else
# Mode 2: Single-node Docker; we've been launched with torchrun
# TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
CMD=( "python" )
fi
if [ "$LOGGER" = "apiLog.sh" ];
then
LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
# TODO(ahmadki): track the apiLog.sh bug and remove the workaround
# there is a bug in apiLog.sh preventing it from collecting
# NCCL logs, the workaround is to log a single rank only
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly node_rank="${SLURM_NODEID:-0}"
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
then
LOGGER=$LOGGER
else
LOGGER=""
fi
fi
PARAMS=(
--lr "${LR}"
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--warmup-epochs "${WARMUP_EPOCHS}"
)
# run training
${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euxo pipefail
# Vars without defaults
: "${DGXSYSTEM:?DGXSYSTEM not set}"
: "${CONT:?CONT not set}"
# Vars with defaults
: "${NEXP:=5}"
: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
: "${CLEAR_CACHES:=1}"
: "${BACKBONE_DIR:=./torch-home}"
: "${CONT_NAME:=single_stage_detector}"
# ci automagically sets this correctly on Selene
: "${DATADIR:=/raid/datasets/openimages/open-images-v6}"
: "${LOGDIR:=$(pwd)/results}"
# Logging
LOG_BASE="ssd_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}"
readonly LOG_FILE_BASE="${LOGDIR}/${LOG_BASE}"
# Other vars
readonly _config_file="./config_${DGXSYSTEM}.sh"
# Mount points
CONT_MOUNTS=(
"--volume=${DATADIR}:/datasets/open-images-v6"
"--volume=${LOGDIR}:/results"
"--volume=${BACKBONE_DIR}:/root/.cache/torch"
)
# MLPerf vars
MLPERF_HOST_OS=$(
source /etc/os-release
source /etc/dgx-release || true
echo "${PRETTY_NAME} / ${DGX_PRETTY_NAME:-???} ${DGX_OTA_VERSION:-${DGX_SWBUILD_VERSION:-???}}"
)
export MLPERF_HOST_OS
# Setup directories
mkdir -p "${LOGDIR}"
# Get list of envvars to pass to docker
mapfile -t _config_env < <(env -i bash -c ". ${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)')
_config_env+=(MLPERF_HOST_OS)
mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done)
# Cleanup container
cleanup_docker() {
docker container rm -f "${CONT_NAME}" || true
}
cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${CONT_NAME}" "${_cont_mounts[@]}" \
"${CONT}" sleep infinity
# make sure container has time to finish initialization
sleep 30
docker exec -it "${CONT_NAME}" true
readonly TORCH_RUN="python -m torch.distributed.run --standalone --no_python"
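# --standalone sets up a single-node rendezvous; --no_python makes the launcher
# exec the given program directly (here ./run_and_time.sh) instead of prefixing
# it with the Python interpreter.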
# Run experiments
for _experiment_index in $(seq 1 "${NEXP}"); do
(
echo "Beginning trial ${_experiment_index} of ${NEXP}"
# Clear caches
if [ "${CLEAR_CACHES}" -eq 1 ]; then
sync && sudo /sbin/sysctl vm.drop_caches=3
docker exec -it "${CONT_NAME}" python -c "
from mlperf_logger import mllogger
mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)"
fi
# Run experiment
docker exec -it "${_config_env[@]}" "${CONT_NAME}" \
${TORCH_RUN} --nproc_per_node=${DGXNGPU} ./run_and_time.sh
) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log"
done