Commit 54a066bf authored by mashun1's avatar mashun1
Browse files

ootdiffusion

parents
Pipeline #1004 canceled with stages
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
import pprint
import sys
from collections import OrderedDict
from collections.abc import Mapping
def print_csv_format(results):
"""
Print main metrics in a format similar to Detectron,
so that they are easy to copypaste into a spreadsheet.
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
"""
assert isinstance(results, OrderedDict), results # unordered results cannot be properly printed
logger = logging.getLogger(__name__)
for task, res in results.items():
# Don't print "AP-category" metrics since they are usually not tracked.
important_res = [(k, v) for k, v in res.items() if "-" not in k]
logger.info("copypaste: Task: {}".format(task))
logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
def verify_results(cfg, results):
"""
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
Returns:
bool: whether the verification succeeds or not
"""
expected_results = cfg.TEST.EXPECTED_RESULTS
if not len(expected_results):
return True
ok = True
for task, metric, expected, tolerance in expected_results:
actual = results[task][metric]
if not np.isfinite(actual):
ok = False
diff = abs(actual - expected)
if diff > tolerance:
ok = False
logger = logging.getLogger(__name__)
if not ok:
logger.error("Result verification failed!")
logger.error("Expected Results: " + str(expected_results))
logger.error("Actual Results: " + pprint.pformat(results))
sys.exit(1)
else:
logger.info("Results verification passed.")
return ok
def flatten_results_dict(results):
"""
Expand a hierarchical dict of scalars into a flat dict of scalars.
If results[k1][k2][k3] = v, the returned dict will have the entry
{"k1/k2/k3": v}.
Args:
results (dict):
"""
r = {}
for k, v in results.items():
if isinstance(v, Mapping):
v = flatten_results_dict(v)
for kk, vv in v.items():
r[k + "/" + kk] = vv
else:
r[k] = v
return r
This directory contains code to prepare a detectron2 model for deployment.
Currently it supports exporting a detectron2 model to Caffe2 format through ONNX.
Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage.
### Acknowledgements
Thanks to Mobile Vision team at Facebook for developing the conversion tools.
# -*- coding: utf-8 -*-
from .api import *
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import copy
import logging
import os
import torch
from caffe2.proto import caffe2_pb2
from torch import nn
from detectron2.config import CfgNode as CN
from .caffe2_export import export_caffe2_detection_model
from .caffe2_export import export_onnx_model as export_onnx_model_impl
from .caffe2_export import run_and_save_graph
from .caffe2_inference import ProtobufDetectionModel
from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph
__all__ = [
"add_export_config",
"export_caffe2_model",
"Caffe2Model",
"export_onnx_model",
"Caffe2Tracer",
]
def add_export_config(cfg):
"""
Args:
cfg (CfgNode): a detectron2 config
Returns:
CfgNode: an updated config with new options that will be used
by :class:`Caffe2Tracer`.
"""
is_frozen = cfg.is_frozen()
cfg.defrost()
cfg.EXPORT_CAFFE2 = CN()
cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT = False
if is_frozen:
cfg.freeze()
return cfg
class Caffe2Tracer:
"""
Make a detectron2 model traceable with caffe2 style.
An original detectron2 model may not be traceable, or
cannot be deployed directly after being traced, due to some reasons:
1. control flow in some ops
2. custom ops
3. complicated pre/post processing
This class provides a traceable version of a detectron2 model by:
1. Rewrite parts of the model using ops in caffe2. Note that some ops do
not have GPU implementation.
2. Define the inputs "after pre-processing" as inputs to the model
3. Remove post-processing and produce raw layer outputs
More specifically about inputs: all builtin models take two input tensors.
(1) NCHW float "data" which is an image (usually in [0, 255])
(2) Nx3 float "im_info", each row of which is (height, width, 1.0)
After making a traceable model, the class provide methods to export such a
model to different deployment formats.
The class currently only supports models using builtin meta architectures.
"""
def __init__(self, cfg, model, inputs):
"""
Args:
cfg (CfgNode): a detectron2 config, with extra export-related options
added by :func:`add_export_config`.
model (nn.Module): a model built by
:func:`detectron2.modeling.build_model`.
inputs: sample inputs that the given model takes for inference.
Will be used to trace the model.
"""
assert isinstance(cfg, CN), cfg
assert isinstance(model, torch.nn.Module), type(model)
if "EXPORT_CAFFE2" not in cfg:
cfg = add_export_config(cfg) # will just the defaults
self.cfg = cfg
self.model = model
self.inputs = inputs
def _get_traceable(self):
# TODO how to make it extensible to support custom models
C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[self.cfg.MODEL.META_ARCHITECTURE]
traceable_model = C2MetaArch(self.cfg, copy.deepcopy(self.model))
traceable_inputs = traceable_model.get_caffe2_inputs(self.inputs)
return traceable_model, traceable_inputs
def export_caffe2(self):
"""
Export the model to Caffe2's protobuf format.
The returned object can be saved with `.save_protobuf()` method.
The result can be loaded and executed using Caffe2 runtime.
Returns:
Caffe2Model
"""
model, inputs = self._get_traceable()
predict_net, init_net = export_caffe2_detection_model(model, inputs)
return Caffe2Model(predict_net, init_net)
def export_onnx(self):
"""
Export the model to ONNX format.
Note that the exported model contains custom ops only available in caffe2, therefore it
cannot be directly executed by other runtime. Post-processing or transformation passes
may be applied on the model to accommodate different runtimes.
Returns:
onnx.ModelProto: an onnx model.
"""
model, inputs = self._get_traceable()
return export_onnx_model_impl(model, (inputs,))
def export_torchscript(self):
"""
Export the model to a `torch.jit.TracedModule` by tracing.
The returned object can be saved to a file by ".save()".
Returns:
torch.jit.TracedModule: a torch TracedModule
"""
model, inputs = self._get_traceable()
logger = logging.getLogger(__name__)
logger.info("Tracing the model with torch.jit.trace ...")
with torch.no_grad():
return torch.jit.trace(model, (inputs,), optimize=True)
def export_caffe2_model(cfg, model, inputs):
"""
Export a detectron2 model to caffe2 format.
Args:
cfg (CfgNode): a detectron2 config, with extra export-related options
added by :func:`add_export_config`.
model (nn.Module): a model built by
:func:`detectron2.modeling.build_model`.
It will be modified by this function.
inputs: sample inputs that the given model takes for inference.
Will be used to trace the model.
Returns:
Caffe2Model
"""
return Caffe2Tracer(cfg, model, inputs).export_caffe2()
def export_onnx_model(cfg, model, inputs):
"""
Export a detectron2 model to ONNX format.
Note that the exported model contains custom ops only available in caffe2, therefore it
cannot be directly executed by other runtime. Post-processing or transformation passes
may be applied on the model to accommodate different runtimes.
Args:
cfg (CfgNode): a detectron2 config, with extra export-related options
added by :func:`add_export_config`.
model (nn.Module): a model built by
:func:`detectron2.modeling.build_model`.
It will be modified by this function.
inputs: sample inputs that the given model takes for inference.
Will be used to trace the model.
Returns:
onnx.ModelProto: an onnx model.
"""
return Caffe2Tracer(cfg, model, inputs).export_onnx()
class Caffe2Model(nn.Module):
"""
A wrapper around the traced model in caffe2's pb format.
"""
def __init__(self, predict_net, init_net):
super().__init__()
self.eval() # always in eval mode
self._predict_net = predict_net
self._init_net = init_net
self._predictor = None
@property
def predict_net(self):
"""
Returns:
core.Net: the underlying caffe2 predict net
"""
return self._predict_net
@property
def init_net(self):
"""
Returns:
core.Net: the underlying caffe2 init net
"""
return self._init_net
__init__.__HIDE_SPHINX_DOC__ = True
def save_protobuf(self, output_dir):
"""
Save the model as caffe2's protobuf format.
Args:
output_dir (str): the output directory to save protobuf files.
"""
logger = logging.getLogger(__name__)
logger.info("Saving model to {} ...".format(output_dir))
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "model.pb"), "wb") as f:
f.write(self._predict_net.SerializeToString())
with open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
f.write(str(self._predict_net))
with open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
f.write(self._init_net.SerializeToString())
def save_graph(self, output_file, inputs=None):
"""
Save the graph as SVG format.
Args:
output_file (str): a SVG file
inputs: optional inputs given to the model.
If given, the inputs will be used to run the graph to record
shape of every tensor. The shape information will be
saved together with the graph.
"""
if inputs is None:
save_graph(self._predict_net, output_file, op_only=False)
else:
size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0)
device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii")
inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device)
inputs = [x.cpu().numpy() for x in inputs]
run_and_save_graph(self._predict_net, self._init_net, inputs, output_file)
@staticmethod
def load_protobuf(dir):
"""
Args:
dir (str): a directory used to save Caffe2Model with
:meth:`save_protobuf`.
The files "model.pb" and "model_init.pb" are needed.
Returns:
Caffe2Model: the caffe2 model loaded from this directory.
"""
predict_net = caffe2_pb2.NetDef()
with open(os.path.join(dir, "model.pb"), "rb") as f:
predict_net.ParseFromString(f.read())
init_net = caffe2_pb2.NetDef()
with open(os.path.join(dir, "model_init.pb"), "rb") as f:
init_net.ParseFromString(f.read())
return Caffe2Model(predict_net, init_net)
def __call__(self, inputs):
"""
An interface that wraps around a caffe2 model and mimics detectron2's models'
input & output format. This is used to compare the outputs of caffe2 model
with its original torch model.
Due to the extra conversion between torch/caffe2,
this method is not meant for benchmark.
"""
if self._predictor is None:
self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net)
return self._predictor(inputs)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import math
import torch
import torch.nn.functional as F
from detectron2.layers import cat
from detectron2.layers.roi_align_rotated import ROIAlignRotated
from detectron2.modeling import poolers
from detectron2.modeling.proposal_generator import rpn
from detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference
from detectron2.structures import Boxes, ImageList, Instances, Keypoints
from .shared import alias, to_device
"""
This file contains caffe2-compatible implementation of several detectrno2 components.
"""
class Caffe2Boxes(Boxes):
"""
Representing a list of detectron2.structures.Boxes from minibatch, each box
is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector
(batch index + 5 coordinates) for RotatedBoxes.
"""
def __init__(self, tensor):
assert isinstance(tensor, torch.Tensor)
assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size()
# TODO: make tensor immutable when dim is Nx5 for Boxes,
# and Nx6 for RotatedBoxes?
self.tensor = tensor
# TODO clean up this class, maybe just extend Instances
class InstancesList(object):
"""
Tensor representation of a list of Instances object for a batch of images.
When dealing with a batch of images with Caffe2 ops, a list of bboxes
(instances) are usually represented by single Tensor with size
(sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class is
for providing common functions to convert between these two representations.
"""
def __init__(self, im_info, indices, extra_fields=None):
# [N, 3] -> (H, W, Scale)
self.im_info = im_info
# [N,] -> indice of batch to which the instance belongs
self.indices = indices
# [N, ...]
self.batch_extra_fields = extra_fields or {}
self.image_size = self.im_info
def get_fields(self):
""" like `get_fields` in the Instances object,
but return each field in tensor representations """
ret = {}
for k, v in self.batch_extra_fields.items():
# if isinstance(v, torch.Tensor):
# tensor_rep = v
# elif isinstance(v, (Boxes, Keypoints)):
# tensor_rep = v.tensor
# else:
# raise ValueError("Can't find tensor representation for: {}".format())
ret[k] = v
return ret
def has(self, name):
return name in self.batch_extra_fields
def set(self, name, value):
data_len = len(value)
if len(self.batch_extra_fields):
assert (
len(self) == data_len
), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
self.batch_extra_fields[name] = value
def __setattr__(self, name, val):
if name in ["im_info", "indices", "batch_extra_fields", "image_size"]:
super().__setattr__(name, val)
else:
self.set(name, val)
def __getattr__(self, name):
if name not in self.batch_extra_fields:
raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
return self.batch_extra_fields[name]
def __len__(self):
return len(self.indices)
def flatten(self):
ret = []
for _, v in self.batch_extra_fields.items():
if isinstance(v, (Boxes, Keypoints)):
ret.append(v.tensor)
else:
ret.append(v)
return ret
@staticmethod
def to_d2_instances_list(instances_list):
"""
Convert InstancesList to List[Instances]. The input `instances_list` can
also be a List[Instances], in this case this method is a non-op.
"""
if not isinstance(instances_list, InstancesList):
assert all(isinstance(x, Instances) for x in instances_list)
return instances_list
ret = []
for i, info in enumerate(instances_list.im_info):
instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())]))
ids = instances_list.indices == i
for k, v in instances_list.batch_extra_fields.items():
if isinstance(v, torch.Tensor):
instances.set(k, v[ids])
continue
elif isinstance(v, Boxes):
instances.set(k, v[ids, -4:])
continue
target_type, tensor_source = v
assert isinstance(tensor_source, torch.Tensor)
assert tensor_source.shape[0] == instances_list.indices.shape[0]
tensor_source = tensor_source[ids]
if issubclass(target_type, Boxes):
instances.set(k, Boxes(tensor_source[:, -4:]))
elif issubclass(target_type, Keypoints):
instances.set(k, Keypoints(tensor_source))
elif issubclass(target_type, torch.Tensor):
instances.set(k, tensor_source)
else:
raise ValueError("Can't handle targe type: {}".format(target_type))
ret.append(instances)
return ret
class Caffe2Compatible(object):
def _get_tensor_mode(self):
return self._tensor_mode
def _set_tensor_mode(self, v):
self._tensor_mode = v
tensor_mode = property(_get_tensor_mode, _set_tensor_mode)
"""
If true, the model expects C2-style tensor only inputs/outputs format.
"""
class Caffe2RPN(Caffe2Compatible, rpn.RPN):
def forward(self, images, features, gt_instances=None):
assert not self.training
features = [features[f] for f in self.in_features]
objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features)
assert isinstance(images, ImageList)
if self.tensor_mode:
im_info = images.image_sizes
else:
im_info = torch.Tensor(
[[im_sz[0], im_sz[1], torch.Tensor([1.0])] for im_sz in images.image_sizes]
).to(images.tensor.device)
assert isinstance(im_info, torch.Tensor)
rpn_rois_list = []
rpn_roi_probs_list = []
for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip(
objectness_logits_pred,
anchor_deltas_pred,
iter(self.anchor_generator.cell_anchors),
self.anchor_generator.strides,
):
scores = scores.detach()
bbox_deltas = bbox_deltas.detach()
rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals(
scores,
bbox_deltas,
im_info,
cell_anchors_tensor,
spatial_scale=1.0 / feat_stride,
pre_nms_topN=self.pre_nms_topk[self.training],
post_nms_topN=self.post_nms_topk[self.training],
nms_thresh=self.nms_thresh,
min_size=self.min_box_side_len,
# correct_transform_coords=True, # deprecated argument
angle_bound_on=True, # Default
angle_bound_lo=-180,
angle_bound_hi=180,
clip_angle_thresh=1.0, # Default
legacy_plus_one=False,
)
rpn_rois_list.append(rpn_rois)
rpn_roi_probs_list.append(rpn_roi_probs)
# For FPN in D2, in RPN all proposals from different levels are concated
# together, ranked and picked by top post_nms_topk. Then in ROIPooler
# it calculates level_assignments and calls the RoIAlign from
# the corresponding level.
if len(objectness_logits_pred) == 1:
rpn_rois = rpn_rois_list[0]
rpn_roi_probs = rpn_roi_probs_list[0]
else:
assert len(rpn_rois_list) == len(rpn_roi_probs_list)
rpn_post_nms_topN = self.post_nms_topk[self.training]
device = rpn_rois_list[0].device
input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)]
# TODO remove this after confirming rpn_max_level/rpn_min_level
# is not needed in CollectRpnProposals.
feature_strides = list(self.anchor_generator.strides)
rpn_min_level = int(math.log2(feature_strides[0]))
rpn_max_level = int(math.log2(feature_strides[-1]))
assert (rpn_max_level - rpn_min_level + 1) == len(
rpn_rois_list
), "CollectRpnProposals requires continuous levels"
rpn_rois = torch.ops._caffe2.CollectRpnProposals(
input_list,
# NOTE: in current implementation, rpn_max_level and rpn_min_level
# are not needed, only the subtraction of two matters and it
# can be infer from the number of inputs. Keep them now for
# consistency.
rpn_max_level=2 + len(rpn_rois_list) - 1,
rpn_min_level=2,
rpn_post_nms_topN=rpn_post_nms_topN,
)
rpn_rois = to_device(rpn_rois, device)
rpn_roi_probs = []
proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode)
return proposals, {}
@staticmethod
def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode):
proposals = InstancesList(
im_info=im_info,
indices=rpn_rois[:, 0],
extra_fields={
"proposal_boxes": Caffe2Boxes(rpn_rois),
"objectness_logits": (torch.Tensor, rpn_roi_probs),
},
)
if not tensor_mode:
proposals = InstancesList.to_d2_instances_list(proposals)
else:
proposals = [proposals]
return proposals
class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler):
@staticmethod
def c2_preprocess(box_lists):
assert all(isinstance(x, Boxes) for x in box_lists)
if all(isinstance(x, Caffe2Boxes) for x in box_lists):
# input is pure-tensor based
assert len(box_lists) == 1
pooler_fmt_boxes = box_lists[0].tensor
else:
pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists)
return pooler_fmt_boxes
def forward(self, x, box_lists):
assert not self.training
pooler_fmt_boxes = self.c2_preprocess(box_lists)
num_level_assignments = len(self.level_poolers)
if num_level_assignments == 1:
if isinstance(self.level_poolers[0], ROIAlignRotated):
c2_roi_align = torch.ops._caffe2.RoIAlignRotated
aligned = True
else:
c2_roi_align = torch.ops._caffe2.RoIAlign
aligned = self.level_poolers[0].aligned
out = c2_roi_align(
x[0],
pooler_fmt_boxes,
order="NCHW",
spatial_scale=float(self.level_poolers[0].spatial_scale),
pooled_h=int(self.output_size[0]),
pooled_w=int(self.output_size[1]),
sampling_ratio=int(self.level_poolers[0].sampling_ratio),
aligned=aligned,
)
return out
device = pooler_fmt_boxes.device
assert (
self.max_level - self.min_level + 1 == 4
), "Currently DistributeFpnProposals only support 4 levels"
fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
to_device(pooler_fmt_boxes, "cpu"),
roi_canonical_scale=self.canonical_box_size,
roi_canonical_level=self.canonical_level,
roi_max_level=self.max_level,
roi_min_level=self.min_level,
legacy_plus_one=False,
)
fpn_outputs = [to_device(x, device) for x in fpn_outputs]
rois_fpn_list = fpn_outputs[:-1]
rois_idx_restore_int32 = fpn_outputs[-1]
roi_feat_fpn_list = []
for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers):
if isinstance(pooler, ROIAlignRotated):
c2_roi_align = torch.ops._caffe2.RoIAlignRotated
aligned = True
else:
c2_roi_align = torch.ops._caffe2.RoIAlign
aligned = bool(pooler.aligned)
roi_feat_fpn = c2_roi_align(
x_level,
roi_fpn,
order="NCHW",
spatial_scale=float(pooler.spatial_scale),
pooled_h=int(self.output_size[0]),
pooled_w=int(self.output_size[1]),
sampling_ratio=int(pooler.sampling_ratio),
aligned=aligned,
)
roi_feat_fpn_list.append(roi_feat_fpn)
roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32)
return roi_feat
class Caffe2FastRCNNOutputsInference:
def __init__(self, tensor_mode):
self.tensor_mode = tensor_mode # whether the output is caffe2 tensor mode
def __call__(self, box_predictor, predictions, proposals):
""" equivalent to FastRCNNOutputLayers.inference """
score_thresh = box_predictor.test_score_thresh
nms_thresh = box_predictor.test_nms_thresh
topk_per_image = box_predictor.test_topk_per_image
is_rotated = len(box_predictor.box2box_transform.weights) == 5
if is_rotated:
box_dim = 5
assert box_predictor.box2box_transform.weights[4] == 1, (
"The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
+ " thus enforcing the angle weight to be 1 for now"
)
box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
else:
box_dim = 4
box2box_transform_weights = box_predictor.box2box_transform.weights
class_logits, box_regression = predictions
class_prob = F.softmax(class_logits, -1)
assert box_regression.shape[1] % box_dim == 0
cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1
input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1
rois = type(proposals[0].proposal_boxes).cat([p.proposal_boxes for p in proposals])
device, dtype = rois.tensor.device, rois.tensor.dtype
if input_tensor_mode:
im_info = proposals[0].image_size
rois = rois.tensor
else:
im_info = torch.Tensor(
[[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]]
)
batch_ids = cat(
[
torch.full((b, 1), i, dtype=dtype, device=device)
for i, b in enumerate(len(p) for p in proposals)
],
dim=0,
)
rois = torch.cat([batch_ids, rois.tensor], dim=1)
roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
to_device(rois, "cpu"),
to_device(box_regression, "cpu"),
to_device(im_info, "cpu"),
weights=box2box_transform_weights,
apply_scale=True,
rotated=is_rotated,
angle_bound_on=True,
angle_bound_lo=-180,
angle_bound_hi=180,
clip_angle_thresh=1.0,
legacy_plus_one=False,
)
roi_pred_bbox = to_device(roi_pred_bbox, device)
roi_batch_splits = to_device(roi_batch_splits, device)
nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
to_device(class_prob, "cpu"),
to_device(roi_pred_bbox, "cpu"),
to_device(roi_batch_splits, "cpu"),
score_thresh=float(score_thresh),
nms=float(nms_thresh),
detections_per_im=int(topk_per_image),
soft_nms_enabled=False,
soft_nms_method="linear",
soft_nms_sigma=0.5,
soft_nms_min_score_thres=0.001,
rotated=is_rotated,
cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
input_boxes_include_bg_cls=False,
output_classes_include_bg_cls=False,
legacy_plus_one=False,
)
roi_score_nms = to_device(nms_outputs[0], device)
roi_bbox_nms = to_device(nms_outputs[1], device)
roi_class_nms = to_device(nms_outputs[2], device)
roi_batch_splits_nms = to_device(nms_outputs[3], device)
roi_keeps_nms = to_device(nms_outputs[4], device)
roi_keeps_size_nms = to_device(nms_outputs[5], device)
if not self.tensor_mode:
roi_class_nms = roi_class_nms.to(torch.int64)
roi_batch_ids = cat(
[
torch.full((b, 1), i, dtype=dtype, device=device)
for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms)
],
dim=0,
)
roi_class_nms = alias(roi_class_nms, "class_nms")
roi_score_nms = alias(roi_score_nms, "score_nms")
roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")
results = InstancesList(
im_info=im_info,
indices=roi_batch_ids[:, 0],
extra_fields={
"pred_boxes": Caffe2Boxes(roi_bbox_nms),
"scores": roi_score_nms,
"pred_classes": roi_class_nms,
},
)
if not self.tensor_mode:
results = InstancesList.to_d2_instances_list(results)
batch_splits = roi_batch_splits_nms.int().tolist()
kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits))
else:
results = [results]
kept_indices = [roi_keeps_nms]
return results, kept_indices
class Caffe2MaskRCNNInference:
def __call__(self, pred_mask_logits, pred_instances):
""" equivalent to mask_head.mask_rcnn_inference """
if all(isinstance(x, InstancesList) for x in pred_instances):
assert len(pred_instances) == 1
mask_probs_pred = pred_mask_logits.sigmoid()
mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs")
pred_instances[0].pred_masks = mask_probs_pred
else:
mask_rcnn_inference(pred_mask_logits, pred_instances)
class Caffe2KeypointRCNNInference:
def __init__(self, use_heatmap_max_keypoint):
self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
def __call__(self, pred_keypoint_logits, pred_instances):
# just return the keypoint heatmap for now,
# there will be option to call HeatmapMaxKeypointOp
output = alias(pred_keypoint_logits, "kps_score")
if all(isinstance(x, InstancesList) for x in pred_instances):
assert len(pred_instances) == 1
if self.use_heatmap_max_keypoint:
device = output.device
output = torch.ops._caffe2.HeatmapMaxKeypoint(
to_device(output, "cpu"),
pred_instances[0].pred_boxes.tensor,
should_output_softmax=True, # worth make it configerable?
)
output = to_device(output, device)
output = alias(output, "keypoints_out")
pred_instances[0].pred_keypoints = output
return pred_keypoint_logits
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import io
import logging
import numpy as np
from typing import List
import onnx
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.onnx.backend import Caffe2Backend
from tabulate import tabulate
from termcolor import colored
from torch.onnx import OperatorExportTypes
from .shared import (
ScopedWS,
construct_init_net_from_params,
fuse_alias_placeholder,
fuse_copy_between_cpu_and_gpu,
get_params_from_init_net,
group_norm_replace_aten_with_caffe2,
infer_device_type,
remove_dead_end_ops,
remove_reshape_for_fc,
save_graph,
)
logger = logging.getLogger(__name__)
def export_onnx_model(model, inputs):
"""
Trace and export a model to onnx format.
Args:
model (nn.Module):
inputs (tuple[args]): the model will be called by `model(*inputs)`
Returns:
an onnx model
"""
assert isinstance(model, torch.nn.Module)
# make sure all modules are in eval mode, onnx may change the training state
# of the module if the states are not consistent
def _check_eval(module):
assert not module.training
model.apply(_check_eval)
# Export the model to ONNX
with torch.no_grad():
with io.BytesIO() as f:
torch.onnx.export(
model,
inputs,
f,
operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
# verbose=True, # NOTE: uncomment this for debugging
# export_params=True,
)
onnx_model = onnx.load_from_string(f.getvalue())
# Apply ONNX's Optimization
all_passes = onnx.optimizer.get_available_passes()
passes = ["fuse_bn_into_conv"]
assert all(p in all_passes for p in passes)
onnx_model = onnx.optimizer.optimize(onnx_model, passes)
return onnx_model
def _op_stats(net_def):
type_count = {}
for t in [op.type for op in net_def.op]:
type_count[t] = type_count.get(t, 0) + 1
type_count_list = sorted(type_count.items(), key=lambda kv: kv[0]) # alphabet
type_count_list = sorted(type_count_list, key=lambda kv: -kv[1]) # count
return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)
def _assign_device_option(
predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
):
"""
ONNX exported network doesn't have concept of device, assign necessary
device option for each op in order to make it runable on GPU runtime.
"""
def _get_device_type(torch_tensor):
assert torch_tensor.device.type in ["cpu", "cuda"]
assert torch_tensor.device.index == 0
return torch_tensor.device.type
def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
for op, ssa_i in zip(net_proto.op, net_ssa):
if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
else:
devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
assert all(d == devices[0] for d in devices)
if devices[0] == "cuda":
op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
# update ops in predict_net
predict_net_input_device_types = {
(name, 0): _get_device_type(tensor)
for name, tensor in zip(predict_net.external_input, tensor_inputs)
}
predict_net_device_types = infer_device_type(
predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
)
predict_net_ssa, _ = core.get_ssa(predict_net)
_assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)
# update ops in init_net
init_net_ssa, versions = core.get_ssa(init_net)
init_net_output_device_types = {
(name, versions[name]): predict_net_device_types[(name, 0)]
for name in init_net.external_output
}
init_net_device_types = infer_device_type(
init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
)
_assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
"""
Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.
Arg:
model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py
tensor_inputs: a list of tensors that caffe2 model takes as input.
"""
model = copy.deepcopy(model)
assert isinstance(model, torch.nn.Module)
assert hasattr(model, "encode_additional_info")
# Export via ONNX
logger.info("Exporting a {} model via ONNX ...".format(type(model).__name__))
onnx_model = export_onnx_model(model, (tensor_inputs,))
# Convert ONNX model to Caffe2 protobuf
init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
logger.info(
"ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
)
# Apply protobuf optimization
fuse_alias_placeholder(predict_net, init_net)
if any(t.device.type != "cpu" for t in tensor_inputs):
fuse_copy_between_cpu_and_gpu(predict_net)
remove_dead_end_ops(init_net)
_assign_device_option(predict_net, init_net, tensor_inputs)
params, device_options = get_params_from_init_net(init_net)
predict_net, params = remove_reshape_for_fc(predict_net, params)
init_net = construct_init_net_from_params(params, device_options)
group_norm_replace_aten_with_caffe2(predict_net)
# Record necessary information for running the pb model in Detectron2 system.
model.encode_additional_info(predict_net, init_net)
logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))
return predict_net, init_net
def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
"""
Run the caffe2 model on given inputs, recording the shape and draw the graph.
predict_net/init_net: caffe2 model.
tensor_inputs: a list of tensors that caffe2 model takes as input.
graph_save_path: path for saving graph of exported model.
"""
logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
save_graph(predict_net, graph_save_path, op_only=False)
# Run the exported Caffe2 net
logger.info("Running ONNX exported model ...")
with ScopedWS("__ws_tmp__", True) as ws:
ws.RunNetOnce(init_net)
initialized_blobs = set(ws.Blobs())
uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
for name, blob in zip(uninitialized, tensor_inputs):
ws.FeedBlob(name, blob)
try:
ws.RunNetOnce(predict_net)
except RuntimeError as e:
logger.warning("Encountered RuntimeError: \n{}".format(str(e)))
ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}
logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)
return ws_blobs
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import collections
import logging
import numpy as np
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
logger = logging.getLogger(__name__)
class ProtobufModel(torch.nn.Module):
"""
A class works just like nn.Module in terms of inference, but running
caffe2 model under the hood. Input/Output are Dict[str, tensor] whose keys
are in external_input/output.
"""
def __init__(self, predict_net, init_net):
logger.info("Initializing ProtobufModel ...")
super().__init__()
assert isinstance(predict_net, caffe2_pb2.NetDef)
assert isinstance(init_net, caffe2_pb2.NetDef)
self.ws_name = "__ws_tmp__"
self.net = core.Net(predict_net)
with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
ws.RunNetOnce(init_net)
for blob in self.net.Proto().external_input:
if blob not in ws.Blobs():
ws.CreateBlob(blob)
ws.CreateNet(self.net)
self._error_msgs = set()
def forward(self, inputs_dict):
assert all(inp in self.net.Proto().external_input for inp in inputs_dict)
with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
for b, tensor in inputs_dict.items():
ws.FeedBlob(b, tensor)
try:
ws.RunNet(self.net.Proto().name)
except RuntimeError as e:
if not str(e) in self._error_msgs:
self._error_msgs.add(str(e))
logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
logger.warning("Catch the error and use partial results.")
outputs_dict = collections.OrderedDict(
[(b, ws.FetchBlob(b)) for b in self.net.Proto().external_output]
)
# Remove outputs of current run, this is necessary in order to
# prevent fetching the result from previous run if the model fails
# in the middle.
for b in self.net.Proto().external_output:
# Needs to create uninitialized blob to make the net runable.
# This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
# but there'no such API.
ws.FeedBlob(b, "{}, a C++ native class of type nullptr (uninitialized).".format(b))
return outputs_dict
class ProtobufDetectionModel(torch.nn.Module):
"""
A class works just like a pytorch meta arch in terms of inference, but running
caffe2 model under the hood.
"""
def __init__(self, predict_net, init_net, *, convert_outputs=None):
"""
Args:
predict_net, init_net (core.Net): caffe2 nets
convert_outptus (callable): a function that converts caffe2
outputs to the same format of the original pytorch model.
By default, use the one defined in the caffe2 meta_arch.
"""
super().__init__()
self.protobuf_model = ProtobufModel(predict_net, init_net)
self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")
if convert_outputs is None:
meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
else:
self._convert_outputs = convert_outputs
def _infer_output_devices(self, inputs_dict):
def _get_device_type(torch_tensor):
assert torch_tensor.device.type in ["cpu", "cuda"]
assert torch_tensor.device.index == 0
return torch_tensor.device.type
predict_net = self.protobuf_model.net.Proto()
input_device_types = {
(name, 0): _get_device_type(tensor) for name, tensor in inputs_dict.items()
}
device_type_map = infer_device_type(
predict_net, known_status=input_device_types, device_name_style="pytorch"
)
ssa, versions = core.get_ssa(predict_net)
versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
output_devices = [device_type_map[outp] for outp in versioned_outputs]
return output_devices
def _convert_inputs(self, batched_inputs):
# currently all models convert inputs in the same way
data, im_info = convert_batched_inputs_to_c2_format(
batched_inputs, self.size_divisibility, self.device
)
return {"data": data, "im_info": im_info}
def forward(self, batched_inputs):
c2_inputs = self._convert_inputs(batched_inputs)
c2_results = self.protobuf_model(c2_inputs)
if any(t.device.type != "cpu" for _, t in c2_inputs.items()):
output_devices = self._infer_output_devices(c2_inputs)
else:
output_devices = ["cpu" for _ in self.protobuf_model.net.Proto().external_output]
def _cast_caffe2_blob_to_torch_tensor(blob, device):
return torch.Tensor(blob).to(device) if isinstance(blob, np.ndarray) else None
c2_results = {
name: _cast_caffe2_blob_to_torch_tensor(c2_results[name], device)
for name, device in zip(self.protobuf_model.net.Proto().external_output, output_devices)
}
return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import functools
import io
import struct
import types
import torch
from detectron2.modeling import meta_arch
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs
from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
from detectron2.modeling.roi_heads import keypoint_head
from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
from .c10 import Caffe2Compatible
from .patcher import ROIHeadsPatcher, patch_generalized_rcnn
from .shared import (
alias,
check_set_pb_arg,
get_pb_arg_floats,
get_pb_arg_valf,
get_pb_arg_vali,
get_pb_arg_vals,
mock_torch_nn_functional_interpolate,
)
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
"""
A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
to detectron2's format (i.e. list of Instances instance).
This only works when the model follows the Caffe2 detectron's naming convention.
Args:
image_sizes (List[List[int, int]]): [H, W] of every image.
tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
force_mask_on (Bool): if true, the it make sure there'll be pred_masks even
if the mask is not found from tensor_outputs (usually due to model crash)
"""
results = [Instances(image_size) for image_size in image_sizes]
batch_splits = tensor_outputs.get("batch_splits", None)
if batch_splits:
raise NotImplementedError()
assert len(image_sizes) == 1
result = results[0]
bbox_nms = tensor_outputs["bbox_nms"]
score_nms = tensor_outputs["score_nms"]
class_nms = tensor_outputs["class_nms"]
# Detection will always success because Conv support 0-batch
assert bbox_nms is not None
assert score_nms is not None
assert class_nms is not None
if bbox_nms.shape[1] == 5:
result.pred_boxes = RotatedBoxes(bbox_nms)
else:
result.pred_boxes = Boxes(bbox_nms)
result.scores = score_nms
result.pred_classes = class_nms.to(torch.int64)
mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
if mask_fcn_probs is not None:
# finish the mask pred
mask_probs_pred = mask_fcn_probs
num_masks = mask_probs_pred.shape[0]
class_pred = result.pred_classes
indices = torch.arange(num_masks, device=class_pred.device)
mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
result.pred_masks = mask_probs_pred
elif force_mask_on:
# NOTE: there's no way to know the height/width of mask here, it won't be
# used anyway when batch size is 0, so just set them to 0.
result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
keypoints_out = tensor_outputs.get("keypoints_out", None)
kps_score = tensor_outputs.get("kps_score", None)
if keypoints_out is not None:
# keypoints_out: [N, 4, #kypoints], where 4 is in order of (x, y, score, prob)
keypoints_tensor = keypoints_out
# NOTE: it's possible that prob is not calculated if "should_output_softmax"
# is set to False in HeatmapMaxKeypoint, so just using raw score, seems
# it doesn't affect mAP. TODO: check more carefully.
keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
result.pred_keypoints = keypoint_xyp
elif kps_score is not None:
# keypoint heatmap to sparse data structure
pred_keypoint_logits = kps_score
keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])
return results
def _cast_to_f32(f64):
return struct.unpack("f", struct.pack("f", f64))[0]
def set_caffe2_compatible_tensor_mode(model, enable=True):
def _fn(m):
if isinstance(m, Caffe2Compatible):
m.tensor_mode = enable
model.apply(_fn)
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
"""
See get_caffe2_inputs() below.
"""
assert all(isinstance(x, dict) for x in batched_inputs)
assert all(x["image"].dim() == 3 for x in batched_inputs)
images = [x["image"] for x in batched_inputs]
images = ImageList.from_tensors(images, size_divisibility)
im_info = []
for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
target_height = input_per_image.get("height", image_size[0])
target_width = input_per_image.get("width", image_size[1]) # noqa
# NOTE: The scale inside im_info is kept as convention and for providing
# post-processing information if further processing is needed. For
# current Caffe2 model definitions that don't include post-processing inside
# the model, this number is not used.
# NOTE: There can be a slight difference between width and height
# scales, using a single number can results in numerical difference
# compared with D2's post-processing.
scale = target_height / image_size[0]
im_info.append([image_size[0], image_size[1], scale])
im_info = torch.Tensor(im_info)
return images.tensor.to(device), im_info.to(device)
class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
"""
Base class for caffe2-compatible implementation of a meta architecture.
The forward is traceable and its traced graph can be converted to caffe2
graph through ONNX.
"""
def __init__(self, cfg, torch_model):
"""
Args:
cfg (CfgNode):
torch_model (nn.Module): the detectron2 model (meta_arch) to be
converted.
"""
super().__init__()
self._wrapped_model = torch_model
self.eval()
set_caffe2_compatible_tensor_mode(self, True)
def get_caffe2_inputs(self, batched_inputs):
"""
Convert pytorch-style structured inputs to caffe2-style inputs that
are tuples of tensors.
Args:
batched_inputs (list[dict]): inputs to a detectron2 model
in its standard format. Each dict has "image" (CHW tensor), and optionally
"height" and "width".
Returns:
tuple[Tensor]:
tuple of tensors that will be the inputs to the
:meth:`forward` method. For existing models, the first
is an NCHW tensor (padded and batched); the second is
a im_info Nx3 tensor, where the rows are
(height, width, unused legacy parameter)
"""
return convert_batched_inputs_to_c2_format(
batched_inputs,
self._wrapped_model.backbone.size_divisibility,
self._wrapped_model.device,
)
def encode_additional_info(self, predict_net, init_net):
"""
Save extra metadata that will be used by inference in the output protobuf.
"""
pass
def forward(self, inputs):
"""
Run the forward in caffe2-style. It has to use caffe2-compatible ops
and the method will be used for tracing.
Args:
inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`.
They will be the inputs of the converted caffe2 graph.
Returns:
tuple[Tensor]: output tensors. They will be the outputs of the
converted caffe2 graph.
"""
raise NotImplementedError
def _caffe2_preprocess_image(self, inputs):
"""
Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
It normalizes the input images, and the final caffe2 graph assumes the
inputs have been batched already.
"""
data, im_info = inputs
data = alias(data, "data")
im_info = alias(im_info, "im_info")
mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
normalized_data = (data - mean) / std
normalized_data = alias(normalized_data, "normalized_data")
# Pack (data, im_info) into ImageList which is recognized by self.inference.
images = ImageList(tensor=normalized_data, image_sizes=im_info)
return images
@staticmethod
def get_outputs_converter(predict_net, init_net):
"""
Creates a function that converts outputs of the caffe2 model to
detectron2's standard format.
The function uses information in `predict_net` and `init_net` that are
available at inferene time. Therefore the function logic can be used in inference.
The returned function has the following signature:
def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs
Where
* batched_inputs (list[dict]): the original input format of the meta arch
* c2_inputs (dict[str, Tensor]): the caffe2 inputs.
* c2_results (dict[str, Tensor]): the caffe2 output format,
corresponding to the outputs of the :meth:`forward` function.
* detectron2_outputs: the original output format of the meta arch.
This function can be used to compare the outputs of the original meta arch and
the converted caffe2 graph.
Returns:
callable: a callable of the above signature.
"""
raise NotImplementedError
class Caffe2GeneralizedRCNN(Caffe2MetaArch):
def __init__(self, cfg, torch_model):
assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
torch_model = patch_generalized_rcnn(torch_model)
super().__init__(cfg, torch_model)
self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads)
def encode_additional_info(self, predict_net, init_net):
size_divisibility = self._wrapped_model.backbone.size_divisibility
check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
check_set_pb_arg(
predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
)
check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")
@mock_torch_nn_functional_interpolate()
def forward(self, inputs):
if not self.tensor_mode:
return self._wrapped_model.inference(inputs)
images = self._caffe2_preprocess_image(inputs)
features = self._wrapped_model.backbone(images.tensor)
proposals, _ = self._wrapped_model.proposal_generator(images, features)
with self.roi_heads_patcher.mock_roi_heads():
detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
return tuple(detector_results[0].flatten())
@staticmethod
def get_outputs_converter(predict_net, init_net):
def f(batched_inputs, c2_inputs, c2_results):
image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
return f
class Caffe2PanopticFPN(Caffe2MetaArch):
def __init__(self, cfg, torch_model):
assert isinstance(torch_model, meta_arch.PanopticFPN)
torch_model = patch_generalized_rcnn(torch_model)
super().__init__(cfg, torch_model)
self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads)
@mock_torch_nn_functional_interpolate()
def forward(self, inputs):
assert self.tensor_mode
images = self._caffe2_preprocess_image(inputs)
features = self._wrapped_model.backbone(images.tensor)
sem_seg_results, _ = self._wrapped_model.sem_seg_head(features)
sem_seg_results = alias(sem_seg_results, "sem_seg")
proposals, _ = self._wrapped_model.proposal_generator(images, features)
with self.roi_heads_patcher.mock_roi_heads(self.tensor_mode):
detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
return tuple(detector_results[0].flatten()) + (sem_seg_results,)
def encode_additional_info(self, predict_net, init_net):
size_divisibility = self._wrapped_model.backbone.size_divisibility
check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
check_set_pb_arg(
predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
)
check_set_pb_arg(predict_net, "meta_architecture", "s", b"PanopticFPN")
# Inference parameters:
check_set_pb_arg(predict_net, "combine_on", "i", self._wrapped_model.combine_on)
check_set_pb_arg(
predict_net,
"combine_overlap_threshold",
"f",
_cast_to_f32(self._wrapped_model.combine_overlap_threshold),
)
check_set_pb_arg(
predict_net,
"combine_stuff_area_limit",
"i",
self._wrapped_model.combine_stuff_area_limit,
)
check_set_pb_arg(
predict_net,
"combine_instances_confidence_threshold",
"f",
_cast_to_f32(self._wrapped_model.combine_instances_confidence_threshold),
)
@staticmethod
def get_outputs_converter(predict_net, init_net):
combine_on = get_pb_arg_vali(predict_net, "combine_on", None)
combine_overlap_threshold = get_pb_arg_valf(predict_net, "combine_overlap_threshold", None)
combine_stuff_area_limit = get_pb_arg_vali(predict_net, "combine_stuff_area_limit", None)
combine_instances_confidence_threshold = get_pb_arg_valf(
predict_net, "combine_instances_confidence_threshold", None
)
def f(batched_inputs, c2_inputs, c2_results):
image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
detector_results = assemble_rcnn_outputs_by_name(
image_sizes, c2_results, force_mask_on=True
)
sem_seg_results = c2_results["sem_seg"]
# copied from meta_arch/panoptic_fpn.py ...
processed_results = []
for sem_seg_result, detector_result, input_per_image, image_size in zip(
sem_seg_results, detector_results, batched_inputs, image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
detector_r = detector_postprocess(detector_result, height, width)
processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
if combine_on:
panoptic_r = combine_semantic_and_instance_outputs(
detector_r,
sem_seg_r.argmax(dim=0),
combine_overlap_threshold,
combine_stuff_area_limit,
combine_instances_confidence_threshold,
)
processed_results[-1]["panoptic_seg"] = panoptic_r
return processed_results
return f
class Caffe2RetinaNet(Caffe2MetaArch):
def __init__(self, cfg, torch_model):
assert isinstance(torch_model, meta_arch.RetinaNet)
super().__init__(cfg, torch_model)
@mock_torch_nn_functional_interpolate()
def forward(self, inputs):
assert self.tensor_mode
images = self._caffe2_preprocess_image(inputs)
# explicitly return the images sizes to avoid removing "im_info" by ONNX
# since it's not used in the forward path
return_tensors = [images.image_sizes]
features = self._wrapped_model.backbone(images.tensor)
features = [features[f] for f in self._wrapped_model.in_features]
for i, feature_i in enumerate(features):
features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
return_tensors.append(features[i])
box_cls, box_delta = self._wrapped_model.head(features)
for i, (box_cls_i, box_delta_i) in enumerate(zip(box_cls, box_delta)):
return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))
return tuple(return_tensors)
def encode_additional_info(self, predict_net, init_net):
size_divisibility = self._wrapped_model.backbone.size_divisibility
check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
check_set_pb_arg(
predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
)
check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")
# Inference parameters:
check_set_pb_arg(
predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.score_threshold)
)
check_set_pb_arg(predict_net, "topk_candidates", "i", self._wrapped_model.topk_candidates)
check_set_pb_arg(
predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.nms_threshold)
)
check_set_pb_arg(
predict_net,
"max_detections_per_image",
"i",
self._wrapped_model.max_detections_per_image,
)
check_set_pb_arg(
predict_net,
"bbox_reg_weights",
"floats",
[_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
)
self._encode_anchor_generator_cfg(predict_net)
def _encode_anchor_generator_cfg(self, predict_net):
# serialize anchor_generator for future use
serialized_anchor_generator = io.BytesIO()
torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
# Ideally we can put anchor generating inside the model, then we don't
# need to store this information.
bytes = serialized_anchor_generator.getvalue()
check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes)
@staticmethod
def get_outputs_converter(predict_net, init_net):
self = types.SimpleNamespace()
serialized_anchor_generator = io.BytesIO(
get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
)
self.anchor_generator = torch.load(serialized_anchor_generator)
bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
self.score_threshold = get_pb_arg_valf(predict_net, "score_threshold", None)
self.topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
self.nms_threshold = get_pb_arg_valf(predict_net, "nms_threshold", None)
self.max_detections_per_image = get_pb_arg_vali(
predict_net, "max_detections_per_image", None
)
# hack to reuse inference code from RetinaNet
self.inference = functools.partial(meta_arch.RetinaNet.inference, self)
self.inference_single_image = functools.partial(
meta_arch.RetinaNet.inference_single_image, self
)
def f(batched_inputs, c2_inputs, c2_results):
image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
box_cls = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
box_delta = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]
# For each feature level, feature should have the same batch size and
# spatial dimension as the box_cls and box_delta.
dummy_features = [box_delta[i].clone()[:, 0:0, :, :] for i in range(num_features)]
anchors = self.anchor_generator(dummy_features)
# self.num_classess can be inferred
self.num_classes = box_cls[0].shape[1] // (box_delta[0].shape[1] // 4)
results = self.inference(box_cls, box_delta, anchors, image_sizes)
return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
return f
META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
"GeneralizedRCNN": Caffe2GeneralizedRCNN,
"PanopticFPN": Caffe2PanopticFPN,
"RetinaNet": Caffe2RetinaNet,
}
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import mock
import torch
from detectron2.modeling import poolers
from detectron2.modeling.proposal_generator import rpn
from detectron2.modeling.roi_heads import keypoint_head, mask_head
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from .c10 import (
Caffe2Compatible,
Caffe2FastRCNNOutputsInference,
Caffe2KeypointRCNNInference,
Caffe2MaskRCNNInference,
Caffe2ROIPooler,
Caffe2RPN,
)
class GenericMixin(object):
pass
class Caffe2CompatibleConverter(object):
"""
A GenericUpdater which implements the `create_from` interface, by modifying
module object and assign it with another class replaceCls.
"""
def __init__(self, replaceCls):
self.replaceCls = replaceCls
def create_from(self, module):
# update module's class to the new class
assert isinstance(module, torch.nn.Module)
if issubclass(self.replaceCls, GenericMixin):
# replaceCls should act as mixin, create a new class on-the-fly
new_class = type(
"{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
(self.replaceCls, module.__class__),
{}, # {"new_method": lambda self: ...},
)
module.__class__ = new_class
else:
# replaceCls is complete class, this allow arbitrary class swap
module.__class__ = self.replaceCls
# initialize Caffe2Compatible
if isinstance(module, Caffe2Compatible):
module.tensor_mode = False
return module
def patch(model, target, updater, *args, **kwargs):
"""
recursively (post-order) update all modules with the target type and its
subclasses, make a initialization/composition/inheritance/... via the
updater.create_from.
"""
for name, module in model.named_children():
model._modules[name] = patch(module, target, updater, *args, **kwargs)
if isinstance(model, target):
return updater.create_from(model, *args, **kwargs)
return model
def patch_generalized_rcnn(model):
ccc = Caffe2CompatibleConverter
model = patch(model, rpn.RPN, ccc(Caffe2RPN))
model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))
return model
@contextlib.contextmanager
def mock_fastrcnn_outputs_inference(
tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
):
with mock.patch.object(
box_predictor_type,
"inference",
autospec=True,
side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
@contextlib.contextmanager
def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
with mock.patch(
"{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
@contextlib.contextmanager
def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
with mock.patch(
"{}.keypoint_rcnn_inference".format(patched_module),
side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
class ROIHeadsPatcher:
def __init__(self, cfg, heads):
self.heads = heads
self.use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
@contextlib.contextmanager
def mock_roi_heads(self, tensor_mode=True):
"""
Patching several inference functions inside ROIHeads and its subclasses
Args:
tensor_mode (bool): whether the inputs/outputs are caffe2's tensor
format or not. Default to True.
"""
# NOTE: this requries the `keypoint_rcnn_inference` and `mask_rcnn_inference`
# are called inside the same file as BaseXxxHead due to using mock.patch.
kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
mask_head_mod = mask_head.BaseMaskRCNNHead.__module__
mock_ctx_managers = [
mock_fastrcnn_outputs_inference(
tensor_mode=tensor_mode,
check=True,
box_predictor_type=type(self.heads.box_predictor),
)
]
if getattr(self.heads, "keypoint_on", False):
mock_ctx_managers += [
mock_keypoint_rcnn_inference(
tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
)
]
if getattr(self.heads, "mask_on", False):
mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]
with contextlib.ExitStack() as stack: # python 3.3+
for mgr in mock_ctx_managers:
stack.enter_context(mgr)
yield
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import collections
import contextlib
import copy
import functools
import logging
import mock
import numpy as np
import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import caffe2.python.utils as putils
import torch
import torch.nn.functional as F
from caffe2.proto import caffe2_pb2
from caffe2.python import core, net_drawer, workspace
from torch.nn.functional import interpolate as interp
logger = logging.getLogger(__name__)
# ==== torch/utils_toffee/cast.py =======================================
def to_device(t, device_str):
"""
This function is a replacement of .to(another_device) such that it allows the
casting to be traced properly by explicitly calling the underlying copy ops.
It also avoids introducing unncessary op when casting to the same device.
"""
src = t.device
dst = torch.device(device_str)
if src == dst:
return t
elif src.type == "cuda" and dst.type == "cpu":
return torch.ops._caffe2.CopyGPUToCPU(t)
elif src.type == "cpu" and dst.type == "cuda":
return torch.ops._caffe2.CopyCPUToGPU(t)
else:
raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst))
# ==== torch/utils_toffee/interpolate.py =======================================
# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
def BilinearInterpolation(tensor_in, up_scale):
assert up_scale % 2 == 0, "Scale should be even"
def upsample_filt(size):
factor = (size + 1) // 2
if size % 2 == 1:
center = factor - 1
else:
center = factor - 0.5
og = np.ogrid[:size, :size]
return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
kernel_size = int(up_scale) * 2
bil_filt = upsample_filt(kernel_size)
dim = int(tensor_in.shape[1])
kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32)
kernel[range(dim), range(dim), :, :] = bil_filt
tensor_out = F.conv_transpose2d(
tensor_in,
weight=to_device(torch.Tensor(kernel), tensor_in.device),
bias=None,
stride=int(up_scale),
padding=int(up_scale / 2),
)
return tensor_out
# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
# using dynamic `scale_factor` rather than static `size`. (T43166860)
# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
def onnx_compatibale_interpolate(
input, size=None, scale_factor=None, mode="nearest", align_corners=None
):
# NOTE: The input dimensions are interpreted in the form:
# `mini-batch x channels x [optional depth] x [optional height] x width`.
if size is None and scale_factor is not None:
if input.dim() == 4:
if isinstance(scale_factor, (int, float)):
height_scale, width_scale = (scale_factor, scale_factor)
else:
assert isinstance(scale_factor, (tuple, list))
assert len(scale_factor) == 2
height_scale, width_scale = scale_factor
assert not align_corners, "No matching C2 op for align_corners == True"
if mode == "nearest":
return torch.ops._caffe2.ResizeNearest(
input, order="NCHW", width_scale=width_scale, height_scale=height_scale
)
elif mode == "bilinear":
logger.warning(
"Use F.conv_transpose2d for bilinear interpolate"
" because there's no such C2 op, this may cause significant"
" slowdown and the boundary pixels won't be as same as"
" using F.interpolate due to padding."
)
assert height_scale == width_scale
return BilinearInterpolation(input, up_scale=height_scale)
logger.warning("Output size is not static, it might cause ONNX conversion issue")
return interp(input, size, scale_factor, mode, align_corners)
@contextlib.contextmanager
def mock_torch_nn_functional_interpolate():
if torch.onnx.is_in_onnx_export():
with mock.patch(
"torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate
):
yield
else:
yield
# ==== torch/utils_caffe2/ws_utils.py ==========================================
class ScopedWS(object):
def __init__(self, ws_name, is_reset, is_cleanup=False):
self.ws_name = ws_name
self.is_reset = is_reset
self.is_cleanup = is_cleanup
self.org_ws = ""
def __enter__(self):
self.org_ws = workspace.CurrentWorkspace()
if self.ws_name is not None:
workspace.SwitchWorkspace(self.ws_name, True)
if self.is_reset:
workspace.ResetWorkspace()
return workspace
def __exit__(self, *args):
if self.is_cleanup:
workspace.ResetWorkspace()
if self.ws_name is not None:
workspace.SwitchWorkspace(self.org_ws)
def fetch_any_blob(name):
bb = None
try:
bb = workspace.FetchBlob(name)
except TypeError:
bb = workspace.FetchInt8Blob(name)
except Exception as e:
logger.error("Get blob {} error: {}".format(name, e))
return bb
# ==== torch/utils_caffe2/protobuf.py ==========================================
def get_pb_arg(pb, arg_name):
for x in pb.arg:
if x.name == arg_name:
return x
return None
def get_pb_arg_valf(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.f if arg is not None else default_val
def get_pb_arg_floats(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(map(float, arg.floats)) if arg is not None else default_val
def get_pb_arg_ints(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(map(int, arg.ints)) if arg is not None else default_val
def get_pb_arg_vali(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.i if arg is not None else default_val
def get_pb_arg_vals(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.s if arg is not None else default_val
def get_pb_arg_valstrings(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(arg.strings) if arg is not None else default_val
def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False):
arg = get_pb_arg(pb, arg_name)
if arg is None:
arg = putils.MakeArgument(arg_name, arg_value)
assert hasattr(arg, arg_attr)
pb.arg.extend([arg])
if allow_override and getattr(arg, arg_attr) != arg_value:
logger.warning(
"Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value)
)
setattr(arg, arg_attr, arg_value)
else:
assert arg is not None
assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format(
getattr(arg, arg_attr), arg_value
)
def _create_const_fill_op_from_numpy(name, tensor, device_option=None):
assert type(tensor) == np.ndarray
kTypeNameMapper = {
np.dtype("float32"): "GivenTensorFill",
np.dtype("int32"): "GivenTensorIntFill",
np.dtype("int64"): "GivenTensorInt64Fill",
np.dtype("uint8"): "GivenTensorStringFill",
}
args_dict = {}
if tensor.dtype == np.dtype("uint8"):
args_dict.update({"values": [str(tensor.data)], "shape": [1]})
else:
args_dict.update({"values": tensor, "shape": tensor.shape})
if device_option is not None:
args_dict["device_option"] = device_option
return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict)
def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor):
assert type(int8_tensor) == workspace.Int8Tensor
kTypeNameMapper = {
np.dtype("int32"): "Int8GivenIntTensorFill",
np.dtype("uint8"): "Int8GivenTensorFill",
}
tensor = int8_tensor.data
assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")]
values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor
return core.CreateOperator(
kTypeNameMapper[tensor.dtype],
[],
[name],
values=values,
shape=tensor.shape,
Y_scale=int8_tensor.scale,
Y_zero_point=int8_tensor.zero_point,
)
def create_const_fill_op(
name: str,
blob: Union[np.ndarray, workspace.Int8Tensor],
device_option: Optional[caffe2_pb2.DeviceOption] = None,
) -> caffe2_pb2.OperatorDef:
"""
Given a blob object, return the Caffe2 operator that creates this blob
as constant. Currently support NumPy tensor and Caffe2 Int8Tensor.
"""
tensor_type = type(blob)
assert tensor_type in [
np.ndarray,
workspace.Int8Tensor,
], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format(
name, type(blob)
)
if tensor_type == np.ndarray:
return _create_const_fill_op_from_numpy(name, blob, device_option)
elif tensor_type == workspace.Int8Tensor:
assert device_option is None
return _create_const_fill_op_from_c2_int8_tensor(name, blob)
def construct_init_net_from_params(
params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
) -> caffe2_pb2.NetDef:
"""
Construct the init_net from params dictionary
"""
init_net = caffe2_pb2.NetDef()
device_options = device_options or {}
for name, blob in params.items():
if isinstance(blob, str):
logger.warning(
(
"Blob {} with type {} is not supported in generating init net,"
" skipped.".format(name, type(blob))
)
)
continue
init_net.op.extend(
[create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
)
init_net.external_output.append(name)
return init_net
def get_producer_map(ssa):
"""
Return dict from versioned blob to (i, j),
where i is index of producer op, j is the index of output of that op.
"""
producer_map = {}
for i in range(len(ssa)):
outputs = ssa[i][1]
for j, outp in enumerate(outputs):
producer_map[outp] = (i, j)
return producer_map
def get_consumer_map(ssa):
"""
Return dict from versioned blob to list of (i, j),
where i is index of consumer op, j is the index of input of that op.
"""
consumer_map = collections.defaultdict(list)
for i in range(len(ssa)):
inputs = ssa[i][0]
for j, inp in enumerate(inputs):
consumer_map[inp].append((i, j))
return consumer_map
def get_params_from_init_net(
init_net: caffe2_pb2.NetDef,
) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
"""
Take the output blobs from init_net by running it.
Outputs:
params: dict from blob name to numpy array
device_options: dict from blob name to the device option of its creating op
"""
# NOTE: this assumes that the params is determined by producer op with the
# only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor.
def _get_device_option(producer_op):
if producer_op.type == "CopyGPUToCPU":
return caffe2_pb2.DeviceOption()
else:
return producer_op.device_option
with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
ws.RunNetOnce(init_net)
params = {b: fetch_any_blob(b) for b in init_net.external_output}
ssa, versions = core.get_ssa(init_net)
producer_map = get_producer_map(ssa)
device_options = {
b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
for b in init_net.external_output
}
return params, device_options
def _updater_raise(op, input_types, output_types):
raise RuntimeError(
"Failed to apply updater for op {} given input_types {} and"
" output_types {}".format(op, input_types, output_types)
)
def _generic_status_identifier(
predict_net: caffe2_pb2.NetDef,
status_updater: Callable,
known_status: Dict[Tuple[str, int], Any],
) -> Dict[Tuple[str, int], Any]:
"""
Statically infer the status of each blob, the status can be such as device type
(CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
is versioned blob (Tuple[str, int]) in the format compatible with ssa.
Inputs:
predict_net: the caffe2 network
status_updater: a callable, given an op and the status of its input/output,
it returns the updated status of input/output. `None` is used for
representing unknown status.
known_status: a dict containing known status, used as initialization.
Outputs:
A dict mapping from versioned blob to its status
"""
ssa, versions = core.get_ssa(predict_net)
versioned_ext_input = [(b, 0) for b in predict_net.external_input]
versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])
allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
assert all(k in allowed_vbs for k in known_status)
assert all(v is not None for v in known_status.values())
_known_status = copy.deepcopy(known_status)
def _check_and_update(key, value):
assert value is not None
if key in _known_status:
if not _known_status[key] == value:
raise RuntimeError(
"Confilict status for {}, existing status {}, new status {}".format(
key, _known_status[key], value
)
)
_known_status[key] = value
def _update_i(op, ssa_i):
versioned_inputs = ssa_i[0]
versioned_outputs = ssa_i[1]
inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
outputs_status = [_known_status.get(b, None) for b in versioned_outputs]
new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)
for versioned_blob, status in zip(
versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
):
if status is not None:
_check_and_update(versioned_blob, status)
for op, ssa_i in zip(predict_net.op, ssa):
_update_i(op, ssa_i)
for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
_update_i(op, ssa_i)
# NOTE: This strictly checks all the blob from predict_net must be assgined
# a known status. However sometimes it's impossible (eg. having deadend op),
# we may relax this constraint if
for k in all_versioned_blobs:
if k not in _known_status:
raise NotImplementedError(
"Can not infer the status for {}. Currently only support the case where"
" a single forward and backward pass can identify status for all blobs.".format(k)
)
return _known_status
def infer_device_type(
predict_net: caffe2_pb2.NetDef,
known_status: Dict[Tuple[str, int], Any],
device_name_style: str = "caffe2",
) -> Dict[Tuple[str, int], str]:
""" Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob """
assert device_name_style in ["caffe2", "pytorch"]
_CPU_STR = "cpu"
_GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"
def _copy_cpu_to_gpu_updater(op, input_types, output_types):
if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
_updater_raise(op, input_types, output_types)
return ([_CPU_STR], [_GPU_STR])
def _copy_gpu_to_cpu_updater(op, input_types, output_types):
if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
_updater_raise(op, input_types, output_types)
return ([_GPU_STR], [_CPU_STR])
def _other_ops_updater(op, input_types, output_types):
non_none_types = [x for x in input_types + output_types if x is not None]
if len(non_none_types) > 0:
the_type = non_none_types[0]
if not all(x == the_type for x in non_none_types):
_updater_raise(op, input_types, output_types)
else:
the_type = None
return ([the_type for _ in op.input], [the_type for _ in op.output])
def _device_updater(op, *args, **kwargs):
return {
"CopyCPUToGPU": _copy_cpu_to_gpu_updater,
"CopyGPUToCPU": _copy_gpu_to_cpu_updater,
}.get(op.type, _other_ops_updater)(op, *args, **kwargs)
return _generic_status_identifier(predict_net, _device_updater, known_status)
# ==== torch/utils_caffe2/vis.py ===============================================
def _modify_blob_names(ops, blob_rename_f):
ret = []
def _replace_list(blob_list, replaced_list):
del blob_list[:]
blob_list.extend(replaced_list)
for x in ops:
cur = copy.deepcopy(x)
_replace_list(cur.input, list(map(blob_rename_f, cur.input)))
_replace_list(cur.output, list(map(blob_rename_f, cur.output)))
ret.append(cur)
return ret
def _rename_blob(name, blob_sizes, blob_ranges):
def _list_to_str(bsize):
ret = ", ".join([str(x) for x in bsize])
ret = "[" + ret + "]"
return ret
ret = name
if blob_sizes is not None and name in blob_sizes:
ret += "\n" + _list_to_str(blob_sizes[name])
if blob_ranges is not None and name in blob_ranges:
ret += "\n" + _list_to_str(blob_ranges[name])
return ret
# graph_name could not contain word 'graph'
def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)
def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
graph = None
ops = net.op
if blob_rename_func is not None:
ops = _modify_blob_names(ops, blob_rename_func)
if not op_only:
graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
else:
graph = net_drawer.GetPydotGraphMinimal(
ops, graph_name, rankdir="TB", minimal_dependency=True
)
try:
par_dir = os.path.dirname(file_name)
if not os.path.exists(par_dir):
os.makedirs(par_dir)
format = os.path.splitext(os.path.basename(file_name))[-1]
if format == ".png":
graph.write_png(file_name)
elif format == ".pdf":
graph.write_pdf(file_name)
elif format == ".svg":
graph.write_svg(file_name)
else:
print("Incorrect format {}".format(format))
except Exception as e:
print("Error when writing graph to image {}".format(e))
return graph
# ==== torch/utils_toffee/aten_to_caffe2.py ====================================
def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
"""
For ONNX exported model, GroupNorm will be represented as ATen op,
this can be a drop in replacement from ATen to GroupNorm
"""
count = 0
for op in predict_net.op:
if op.type == "ATen":
op_name = get_pb_arg_vals(op, "operator", None) # return byte in py3
if op_name and op_name.decode() == "group_norm":
op.arg.remove(get_pb_arg(op, "operator"))
if get_pb_arg_vali(op, "cudnn_enabled", None):
op.arg.remove(get_pb_arg(op, "cudnn_enabled"))
num_groups = get_pb_arg_vali(op, "num_groups", None)
if num_groups is not None:
op.arg.remove(get_pb_arg(op, "num_groups"))
check_set_pb_arg(op, "group", "i", num_groups)
op.type = "GroupNorm"
count += 1
if count > 1:
logger.info("Replaced {} ATen operator to GroupNormOp".format(count))
# ==== torch/utils_toffee/alias.py =============================================
def alias(x, name, is_backward=False):
if not torch.onnx.is_in_onnx_export():
return x
assert isinstance(x, torch.Tensor)
return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
def fuse_alias_placeholder(predict_net, init_net):
""" Remove AliasWithName placeholder and rename the input/output of it """
# First we finish all the re-naming
for i, op in enumerate(predict_net.op):
if op.type == "AliasWithName":
assert len(op.input) == 1
assert len(op.output) == 1
name = get_pb_arg_vals(op, "name", None).decode()
is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
rename_op_output(predict_net, i, 0, name)
# Remove AliasWithName, should be very safe since it's a non-op
new_ops = []
for op in predict_net.op:
if op.type != "AliasWithName":
new_ops.append(op)
else:
# safety check
assert op.input == op.output
assert op.input[0] == op.arg[0].s.decode()
del predict_net.op[:]
predict_net.op.extend(new_ops)
# ==== torch/utils_caffe2/graph_transform.py ===================================
class IllegalGraphTransformError(ValueError):
""" When a graph transform function call can't be executed. """
def _rename_versioned_blob_in_proto(
proto: caffe2_pb2.NetDef,
old_name: str,
new_name: str,
version: int,
ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
start_versions: Dict[str, int],
end_versions: Dict[str, int],
):
""" In given proto, rename all blobs with matched version """
# Operater list
for op, i_th_ssa in zip(proto.op, ssa):
versioned_inputs, versioned_outputs = i_th_ssa
for i in range(len(op.input)):
if versioned_inputs[i] == (old_name, version):
op.input[i] = new_name
for i in range(len(op.output)):
if versioned_outputs[i] == (old_name, version):
op.output[i] = new_name
# external_input
if start_versions.get(old_name, 0) == version:
for i in range(len(proto.external_input)):
if proto.external_input[i] == old_name:
proto.external_input[i] = new_name
# external_output
if end_versions.get(old_name, 0) == version:
for i in range(len(proto.external_output)):
if proto.external_output[i] == old_name:
proto.external_output[i] = new_name
def rename_op_input(
predict_net: caffe2_pb2.NetDef,
init_net: caffe2_pb2.NetDef,
op_id: int,
input_id: int,
new_name: str,
from_producer: bool = False,
):
"""
Rename the op_id-th operator in predict_net, change it's input_id-th input's
name to the new_name. It also does automatic re-route and change
external_input and init_net if necessary.
- It requires the input is only consumed by this op.
- This function modifies predict_net and init_net in-place.
- When from_producer is enable, this also updates other operators that consumes
the same input. Be cautious because may trigger unintended behavior.
"""
assert isinstance(predict_net, caffe2_pb2.NetDef)
assert isinstance(init_net, caffe2_pb2.NetDef)
init_net_ssa, init_net_versions = core.get_ssa(init_net)
predict_net_ssa, predict_net_versions = core.get_ssa(
predict_net, copy.deepcopy(init_net_versions)
)
versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
old_name, version = versioned_inputs[input_id]
if from_producer:
producer_map = get_producer_map(predict_net_ssa)
if not (old_name, version) in producer_map:
raise NotImplementedError(
"Can't find producer, the input {} is probably from"
" init_net, this is not supported yet.".format(old_name)
)
producer = producer_map[(old_name, version)]
rename_op_output(predict_net, producer[0], producer[1], new_name)
return
def contain_targets(op_ssa):
return (old_name, version) in op_ssa[0]
is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
if sum(is_consumer) > 1:
raise IllegalGraphTransformError(
(
"Input '{}' of operator(#{}) are consumed by other ops, please use"
+ " rename_op_output on the producer instead. Offending op: \n{}"
).format(old_name, op_id, predict_net.op[op_id])
)
# update init_net
_rename_versioned_blob_in_proto(
init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
)
# update predict_net
_rename_versioned_blob_in_proto(
predict_net,
old_name,
new_name,
version,
predict_net_ssa,
init_net_versions,
predict_net_versions,
)
def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
"""
Rename the op_id-th operator in predict_net, change it's output_id-th input's
name to the new_name. It also does automatic re-route and change
external_output and if necessary.
- It allows multiple consumers of its output.
- This function modifies predict_net in-place, doesn't need init_net.
"""
assert isinstance(predict_net, caffe2_pb2.NetDef)
ssa, blob_versions = core.get_ssa(predict_net)
versioned_inputs, versioned_outputs = ssa[op_id]
old_name, version = versioned_outputs[output_id]
# update predict_net
_rename_versioned_blob_in_proto(
predict_net, old_name, new_name, version, ssa, {}, blob_versions
)
def get_sub_graph_external_input_output(
predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
"""
Return the list of external input/output of sub-graph,
each element is tuple of the name and corresponding version in predict_net.
external input/output is defined the same way as caffe2 NetDef.
"""
ssa, versions = core.get_ssa(predict_net)
all_inputs = []
all_outputs = []
for op_id in sub_graph_op_indices:
all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
all_outputs += list(ssa[op_id][1]) # ssa output won't repeat
# for versioned blobs, external inputs are just those blob in all_inputs
# but not in all_outputs
ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]
# external outputs are essentially outputs of this subgraph that are used
# outside of this sub-graph (including predict_net.external_output)
all_other_inputs = sum(
(ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
[(outp, versions[outp]) for outp in predict_net.external_output],
)
ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]
return ext_inputs, ext_outputs
class DiGraph:
""" A DAG representation of caffe2 graph, each vertice is a versioned blob. """
def __init__(self):
self.vertices = set()
self.graph = collections.defaultdict(list)
def add_edge(self, u, v):
self.graph[u].append(v)
self.vertices.add(u)
self.vertices.add(v)
# grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/
def get_all_paths(self, s, d):
visited = {k: False for k in self.vertices}
path = []
all_paths = []
def _get_all_paths_util(graph, u, d, visited, path):
visited[u] = True
path.append(u)
if u == d:
all_paths.append(copy.deepcopy(path))
else:
for i in graph[u]:
if not visited[i]:
_get_all_paths_util(graph, i, d, visited, path)
path.pop()
visited[u] = False
_get_all_paths_util(self.graph, s, d, visited, path)
return all_paths
@staticmethod
def from_ssa(ssa):
graph = DiGraph()
for op_id in range(len(ssa)):
for inp in ssa[op_id][0]:
for outp in ssa[op_id][1]:
graph.add_edge(inp, outp)
return graph
def _get_dependency_chain(ssa, versioned_target, versioned_source):
"""
Return the index list of relevant operator to produce target blob from source blob,
if there's no dependency, return empty list.
"""
# finding all paths between nodes can be O(N!), thus we can only search
# in the subgraph using the op starting from the first consumer of source blob
# to the producer of the target blob.
consumer_map = get_consumer_map(ssa)
producer_map = get_producer_map(ssa)
start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
end_op = (
producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
)
sub_graph_ssa = ssa[start_op : end_op + 1]
if len(sub_graph_ssa) > 30:
logger.warning(
"Subgraph bebetween {} and {} is large (from op#{} to op#{}), it"
" might take non-trival time to find all paths between them.".format(
versioned_source, versioned_target, start_op, end_op
)
)
dag = DiGraph.from_ssa(sub_graph_ssa)
paths = dag.get_all_paths(versioned_source, versioned_target) # include two ends
ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
"""
Idenfity the reshape sub-graph in a protobuf.
The reshape sub-graph is defined as matching the following pattern:
(input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
└-------------------------------------------> Reshape -> (output_blob)
Return:
List of sub-graphs, each sub-graph is represented as a list of indices
of the relavent ops, [Op_1, Op_2, ..., Op_N, Reshape]
"""
ssa, _ = core.get_ssa(predict_net)
ret = []
for i, op in enumerate(predict_net.op):
if op.type == "Reshape":
assert len(op.input) == 2
input_ssa = ssa[i][0]
data_source = input_ssa[0]
shape_source = input_ssa[1]
op_indices = _get_dependency_chain(ssa, shape_source, data_source)
ret.append(op_indices + [i])
return ret
def remove_reshape_for_fc(predict_net, params):
"""
In PyTorch nn.Linear has to take 2D tensor, this often leads to reshape
a 4D tensor to 2D by calling .view(). However this (dynamic) reshaping
doesn't work well with ONNX and Int8 tools, and cause using extra
ops (eg. ExpandDims) that might not be available on mobile.
Luckily Caffe2 supports 4D tensor for FC, so we can remove those reshape
after exporting ONNX model.
"""
from caffe2.python import core
# find all reshape sub-graph that can be removed, which is now all Reshape
# sub-graph whose output is only consumed by FC.
# TODO: to make it safer, we may need the actually value to better determine
# if a Reshape before FC is removable.
reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
sub_graphs_to_remove = []
for reshape_sub_graph in reshape_sub_graphs:
reshape_op_id = reshape_sub_graph[-1]
assert predict_net.op[reshape_op_id].type == "Reshape"
ssa, _ = core.get_ssa(predict_net)
reshape_output = ssa[reshape_op_id][1][0]
consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
# safety check if the sub-graph is isolated, for this reshape sub-graph,
# it means it has one non-param external input and one external output.
ext_inputs, ext_outputs = get_sub_graph_external_input_output(
predict_net, reshape_sub_graph
)
non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
sub_graphs_to_remove.append(reshape_sub_graph)
# perform removing subgraph by:
# 1: rename the Reshape's output to its input, then the graph can be
# seen as in-place itentify, meaning whose external input/output are the same.
# 2: simply remove those ops.
remove_op_ids = []
params_to_remove = []
for sub_graph in sub_graphs_to_remove:
logger.info(
"Remove Reshape sub-graph:\n{}".format(
"".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
)
)
reshape_op_id = sub_graph[-1]
new_reshap_output = predict_net.op[reshape_op_id].input[0]
rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output)
ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
remove_op_ids.extend(sub_graph)
params_to_remove.extend(params_ext_inputs)
predict_net = copy.deepcopy(predict_net)
new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
del predict_net.op[:]
predict_net.op.extend(new_ops)
for versioned_params in params_to_remove:
name = versioned_params[0]
logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
del params[name]
predict_net.external_input.remove(name)
return predict_net, params
def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
"""
In-place fuse extra copy ops between cpu/gpu for the following case:
a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1
-CopyBToA> c2 -NextOp2-> d2
The fused network will look like:
a -NextOp1-> d1
-NextOp2-> d2
"""
_COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]
def _fuse_once(predict_net):
ssa, blob_versions = core.get_ssa(predict_net)
consumer_map = get_consumer_map(ssa)
versioned_external_output = [
(name, blob_versions[name]) for name in predict_net.external_output
]
for op_id, op in enumerate(predict_net.op):
if op.type in _COPY_OPS:
fw_copy_versioned_output = ssa[op_id][1][0]
consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]
is_fusable = (
len(consumer_ids) > 0
and fw_copy_versioned_output not in versioned_external_output
and all(
predict_net.op[_op_id].type == reverse_op_type
and ssa[_op_id][1][0] not in versioned_external_output
for _op_id in consumer_ids
)
)
if is_fusable:
for rv_copy_op_id in consumer_ids:
# making each NextOp uses "a" directly and removing Copy ops
rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
predict_net.op[next_op_id].input[inp_id] = op.input[0]
# remove CopyOps
new_ops = [
op
for i, op in enumerate(predict_net.op)
if i != op_id and i not in consumer_ids
]
del predict_net.op[:]
predict_net.op.extend(new_ops)
return True
return False
# _fuse_once returns False is nothing can be fused
while _fuse_once(predict_net):
pass
def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
""" remove ops if its output is not used or not in external_output """
ssa, versions = core.get_ssa(net_def)
versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
consumer_map = get_consumer_map(ssa)
removed_op_ids = set()
def _is_dead_end(versioned_blob):
return not (
versioned_blob in versioned_external_output
or (
len(consumer_map[versioned_blob]) > 0
and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
)
)
for i, ssa_i in reversed(list(enumerate(ssa))):
versioned_outputs = ssa_i[1]
if all(_is_dead_end(outp) for outp in versioned_outputs):
removed_op_ids.add(i)
# simply removing those deadend ops should have no effect to external_output
new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
del net_def.op[:]
net_def.op.extend(new_ops)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm
from .deform_conv import DeformConv, ModulatedDeformConv
from .mask_ops import paste_masks_in_image
from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
from .roi_align import ROIAlign, roi_align
from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
from .shape_spec import ShapeSpec
from .wrappers import BatchNorm2d, Conv2d, ConvTranspose2d, cat, interpolate, Linear
from .blocks import CNNBlockBase
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import torch
import torch.distributed as dist
from torch import nn
from torch.autograd.function import Function
from torch.nn import functional as F
from detectron2.utils import comm
from .wrappers import BatchNorm2d
TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
class FrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
It contains non-trainable buffers called
"weight" and "bias", "running_mean", "running_var",
initialized to perform identity transformation.
The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
which are computed from the original four parameters of BN.
The affine transform `x * weight + bias` will perform the equivalent
computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
When loading a backbone model from Caffe2, "running_mean" and "running_var"
will be left unchanged as identity transformation.
Other pre-trained backbone models may contain all 4 parameters.
The forward is implemented by `F.batch_norm(..., training=False)`.
"""
_version = 3
def __init__(self, num_features, eps=1e-5):
super().__init__()
self.num_features = num_features
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features) - eps)
def forward(self, x):
if x.requires_grad:
# When gradients are needed, F.batch_norm will use extra memory
# because its backward op computes gradients for weight/bias as well.
scale = self.weight * (self.running_var + self.eps).rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return x * scale + bias
else:
# When gradients are not needed, F.batch_norm is a single fused op
# and provide more optimization opportunities.
return F.batch_norm(
x,
self.running_mean,
self.running_var,
self.weight,
self.bias,
training=False,
eps=self.eps,
)
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
version = local_metadata.get("version", None)
if version is None or version < 2:
# No running_mean/var in early versions
# This will silent the warnings
if prefix + "running_mean" not in state_dict:
state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
if prefix + "running_var" not in state_dict:
state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
if version is not None and version < 3:
logger = logging.getLogger(__name__)
logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
# In version < 3, running_var are used without +eps.
state_dict[prefix + "running_var"] -= self.eps
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def __repr__(self):
return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
@classmethod
def convert_frozen_batchnorm(cls, module):
"""
Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
Args:
module (torch.nn.Module):
Returns:
If module is BatchNorm/SyncBatchNorm, returns a new module.
Otherwise, in-place convert module and return it.
Similar to convert_sync_batchnorm in
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
"""
bn_module = nn.modules.batchnorm
bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
res = module
if isinstance(module, bn_module):
res = cls(module.num_features)
if module.affine:
res.weight.data = module.weight.data.clone().detach()
res.bias.data = module.bias.data.clone().detach()
res.running_mean.data = module.running_mean.data
res.running_var.data = module.running_var.data
res.eps = module.eps
else:
for name, child in module.named_children():
new_child = cls.convert_frozen_batchnorm(child)
if new_child is not child:
res.add_module(name, new_child)
return res
def get_norm(norm, out_channels):
"""
Args:
norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
or a callable that takes a channel number and returns
the normalization layer as a nn.Module.
Returns:
nn.Module or None: the normalization layer
"""
if isinstance(norm, str):
if len(norm) == 0:
return None
norm = {
"BN": BatchNorm2d,
# Fixed in https://github.com/pytorch/pytorch/pull/36382
"SyncBN": NaiveSyncBatchNorm if TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
"FrozenBN": FrozenBatchNorm2d,
"GN": lambda channels: nn.GroupNorm(32, channels),
# for debugging:
"nnSyncBN": nn.SyncBatchNorm,
"naiveSyncBN": NaiveSyncBatchNorm,
}[norm]
return norm(out_channels)
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
# Use allgather instead of allreduce since I don't trust in-place operations ..
dist.all_gather(input_list, input, async_op=False)
inputs = torch.stack(input_list, dim=0)
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, async_op=False)
return grad_output
class NaiveSyncBatchNorm(BatchNorm2d):
"""
In PyTorch<=1.5, `nn.SyncBatchNorm` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferrable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if comm.get_world_size() == 1 or not self.training:
return super().forward(input)
B, C = input.shape[0], input.shape[1]
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
momentum = self.momentum
else:
if B == 0:
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = torch.cat(
[mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
)
vec = AllReduce.apply(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero
mean, meansqr, _ = torch.split(vec / total_batch, C)
var = meansqr - mean * mean
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
self.running_mean += momentum * (mean.detach() - self.running_mean)
self.running_var += momentum * (var.detach() - self.running_var)
return input * scale + bias
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from torch import nn
from .batch_norm import FrozenBatchNorm2d
class CNNBlockBase(nn.Module):
"""
A CNN block is assumed to have input channels, output channels and a stride.
The input and output of `forward()` method must be NCHW tensors.
The method can perform arbitrary computation but must match the given
channels and stride specification.
Attribute:
in_channels (int):
out_channels (int):
stride (int):
"""
def __init__(self, in_channels, out_channels, stride):
"""
The `__init__` method of any subclass should also contain these arguments.
Args:
in_channels (int):
out_channels (int):
stride (int):
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def freeze(self):
"""
Make this block not trainable.
This method sets all parameters to `requires_grad=False`,
and convert all BatchNorm layers to FrozenBatchNorm
Returns:
the block itself
"""
for p in self.parameters():
p.requires_grad = False
FrozenBatchNorm2d.convert_frozen_batchnorm(self)
return self
To add a new Op:
1. Create a new directory
2. Implement new ops there
3. Delcare its Python interface in `vision.cpp`.
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor ROIAlign_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
bool aligned);
at::Tensor ROIAlign_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio,
bool aligned);
#ifdef WITH_CUDA
at::Tensor ROIAlign_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
bool aligned);
at::Tensor ROIAlign_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio,
bool aligned);
#endif
// Interface for Python
inline at::Tensor ROIAlign_forward(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
bool aligned) {
if (input.is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_forward_cuda(
input,
rois,
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio,
aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return ROIAlign_forward_cpu(
input,
rois,
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio,
aligned);
}
inline at::Tensor ROIAlign_backward(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio,
bool aligned) {
if (grad.is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_backward_cuda(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio,
aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return ROIAlign_backward_cpu(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio,
aligned);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/TensorUtils.h>
#include "ROIAlign.h"
namespace {
// implementation taken from Caffe2
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void ROIAlignForward(
const int nthreads,
const T* input,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* output,
bool aligned) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
// When the grid is empty, output zeros == 0/1, instead of NaN.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0)
y = 0;
if (x <= 0)
x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
template <typename T>
void ROIAlignBackward(
const int nthreads,
// may not be contiguous, and should be indexed using n_stride, etc
const T* grad_output,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride,
bool aligned) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
w1,
w2,
w3,
w4,
x_low,
x_high,
y_low,
y_high,
index);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignBackward
} // namespace
namespace detectron2 {
at::Tensor ROIAlign_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
bool aligned) {
AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t});
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
at::Tensor output = at::zeros(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
if (output.numel() == 0)
return output;
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward<scalar_t>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
aligned);
});
return output;
}
at::Tensor ROIAlign_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio,
bool aligned) {
AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t});
at::Tensor grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignBackward<scalar_t>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride,
aligned);
});
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T>
__device__ T bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
T y,
T x,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0)
y = 0;
if (x <= 0)
x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__global__ void RoIAlignForward(
const int nthreads,
const T* bottom_data,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
T* top_data,
bool aligned) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
// When the grid is empty, output zeros == 0/1, instead of NaN.
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(
offset_bottom_data, height, width, y, x, index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0)
y = 0;
if (x <= 0)
x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = bottom_data[y_low * width + x_low];
// T v2 = bottom_data[y_low * width + x_high];
// T v3 = bottom_data[y_high * width + x_low];
// T v4 = bottom_data[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <typename T>
__global__ void RoIAlignBackwardFeature(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois,
bool aligned) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
w1,
w2,
w3,
w4,
x_low,
x_high,
y_low,
y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
namespace detectron2 {
at::Tensor ROIAlign_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
bool aligned) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
if (output.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
aligned);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlign_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio,
bool aligned) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
aligned);
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#ifdef WITH_CUDA
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#endif
// Interface for Python
inline at::Tensor ROIAlignRotated_forward(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
if (input.is_cuda()) {
#ifdef WITH_CUDA
return ROIAlignRotated_forward_cuda(
input,
rois,
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return ROIAlignRotated_forward_cpu(
input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
inline at::Tensor ROIAlignRotated_backward(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
if (grad.is_cuda()) {
#ifdef WITH_CUDA
return ROIAlignRotated_backward_cuda(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return ROIAlignRotated_backward_cpu(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/TensorUtils.h>
#include "ROIAlignRotated.h"
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
T roi_center_h,
T roi_center_w,
T cos_theta,
T sin_theta,
std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
} // namespace
template <typename T>
void ROIAlignRotatedForward(
const int nthreads,
const T* input,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* output) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
roi_center_h,
roi_center_w,
cos_theta,
sin_theta,
pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void ROIAlignRotatedBackward(
const int nthreads,
// may not be contiguous. should index using n_stride, etc
const T* grad_output,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t});
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
at::Tensor output = at::zeros(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
if (output.numel() == 0) {
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedForward<scalar_t>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
return output;
}
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t});
at::Tensor grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedBackward<scalar_t>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
__device__ T bilinear_interpolate(
const T* input,
const int height,
const int width,
T y,
T x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
} // namespace
template <typename T>
__global__ void RoIAlignRotatedForward(
const int nthreads,
const T* input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (inte gral) pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T val = bilinear_interpolate(offset_input, height, width, y, x);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__global__ void RoIAlignRotatedBackwardFeature(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
if (output.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
grad.scalar_type(), "ROIAlignRotated_backward", [&] {
RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace detectron2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment