Unverified Commit 0a8586c9 authored by mcarilli's avatar mcarilli Committed by GitHub
Browse files

[WIP] Allow autocast for 1.6 (#2384)



* Fixes Xiao's repro

* Ports nms to use full dispatcher

* Move HIPGuard to nms_cuda

* clang-format

* run models in test_models.py on GPU if available

* Francisco's comment, also disable cuda model tests to see if CPU alone still passes

* cuda tests now pass locally, although still not comparing to saved numerics

* add note for thing to ask francisco

* Allow cuda and cpu tests to share a data file

* ignore suffix if unneeded

* Skip autocast numerics checks for a few models

* Add roi_align test
Co-authored-by: default avatarMichael Carilli <mcarilli@nvidia.com>
parent 4246abc8
...@@ -85,7 +85,7 @@ def is_iterable(obj): ...@@ -85,7 +85,7 @@ def is_iterable(obj):
class TestCase(unittest.TestCase): class TestCase(unittest.TestCase):
precision = 1e-5 precision = 1e-5
def assertExpected(self, output, subname=None, prec=None): def assertExpected(self, output, subname=None, prec=None, strip_suffix=None):
r""" r"""
Test that a python value matches the recorded contents of a file Test that a python value matches the recorded contents of a file
derived from the name of this test and subname. The value must be derived from the name of this test and subname. The value must be
...@@ -96,16 +96,24 @@ class TestCase(unittest.TestCase): ...@@ -96,16 +96,24 @@ class TestCase(unittest.TestCase):
If you call this multiple times in a single function, you must If you call this multiple times in a single function, you must
give a unique subname each time. give a unique subname each time.
strip_suffix allows different tests that expect similar numerics, e.g.
"test_xyz_cuda" and "test_xyz_cpu", to use the same pickled data.
test_xyz_cuda would pass strip_suffix="_cuda", test_xyz_cpu would pass
strip_suffix="_cpu", and they would both use a data file name based on
"test_xyz".
""" """
def remove_prefix(text, prefix): def remove_prefix_suffix(text, prefix, suffix):
if text.startswith(prefix): if text.startswith(prefix):
return text[len(prefix):] text = text[len(prefix):]
if suffix is not None and text.endswith(suffix):
text = text[:len(text) - len(suffix)]
return text return text
# NB: we take __file__ from the module that defined the test # NB: we take __file__ from the module that defined the test
# class, so we place the expect directory where the test script # class, so we place the expect directory where the test script
# lives, NOT where test/common_utils.py lives. # lives, NOT where test/common_utils.py lives.
module_id = self.__class__.__module__ module_id = self.__class__.__module__
munged_id = remove_prefix(self.id(), module_id + ".") munged_id = remove_prefix_suffix(self.id(), module_id + ".", strip_suffix)
test_file = os.path.realpath(sys.modules[module_id].__file__) test_file = os.path.realpath(sys.modules[module_id].__file__)
expected_file = os.path.join(os.path.dirname(test_file), expected_file = os.path.join(os.path.dirname(test_file),
"expect", "expect",
......
...@@ -74,6 +74,26 @@ script_test_models = { ...@@ -74,6 +74,26 @@ script_test_models = {
} }
# The following models exhibit flaky numerics under autocast in _test_*_model harnesses.
# This may be caused by the harness environment (e.g. num classes, input initialization
# via torch.rand), and does not prove autocast is unsuitable when training with real data
# (autocast has been used successfully with real data for some of these models).
# TODO: investigate why autocast numerics are flaky in the harnesses.
#
# For the following models, _test_*_model harnesses skip numerical checks on outputs when
# trying autocast. However, they still try an autocasted forward pass, so they still ensure
# autocast coverage suffices to prevent dtype errors in each model.
autocast_flaky_numerics = (
"fasterrcnn_resnet50_fpn",
"inception_v3",
"keypointrcnn_resnet50_fpn",
"maskrcnn_resnet50_fpn",
"resnet101",
"resnet152",
"wide_resnet101_2",
)
class ModelTester(TestCase): class ModelTester(TestCase):
def checkModule(self, model, name, args): def checkModule(self, model, name, args):
if name not in script_test_models: if name not in script_test_models:
...@@ -81,65 +101,87 @@ class ModelTester(TestCase): ...@@ -81,65 +101,87 @@ class ModelTester(TestCase):
unwrapper = script_test_models[name].get('unwrapper', None) unwrapper = script_test_models[name].get('unwrapper', None)
return super(ModelTester, self).checkModule(model, args, unwrapper=unwrapper, skip=False) return super(ModelTester, self).checkModule(model, args, unwrapper=unwrapper, skip=False)
def _test_classification_model(self, name, input_shape): def _test_classification_model(self, name, input_shape, dev):
set_rng_seed(0) set_rng_seed(0)
# passing num_class equal to a number other than 1000 helps in making the test # passing num_class equal to a number other than 1000 helps in making the test
# more enforcing in nature # more enforcing in nature
model = models.__dict__[name](num_classes=50) model = models.__dict__[name](num_classes=50)
model.eval() model.eval().to(device=dev)
x = torch.rand(input_shape) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
x = torch.rand(input_shape).to(device=dev)
out = model(x) out = model(x)
self.assertExpected(out, prec=0.1) self.assertExpected(out.cpu(), prec=0.1, strip_suffix="_" + dev)
self.assertEqual(out.shape[-1], 50) self.assertEqual(out.shape[-1], 50)
self.checkModule(model, name, (x,)) self.checkModule(model, name, (x,))
def _test_segmentation_model(self, name): if dev == "cuda":
with torch.cuda.amp.autocast():
out = model(x)
# See autocast_flaky_numerics comment at top of file.
if name not in autocast_flaky_numerics:
self.assertExpected(out.cpu(), prec=0.1, strip_suffix="_" + dev)
self.assertEqual(out.shape[-1], 50)
def _test_segmentation_model(self, name, dev):
# passing num_class equal to a number other than 1000 helps in making the test # passing num_class equal to a number other than 1000 helps in making the test
# more enforcing in nature # more enforcing in nature
model = models.segmentation.__dict__[name](num_classes=50, pretrained_backbone=False) model = models.segmentation.__dict__[name](num_classes=50, pretrained_backbone=False)
model.eval() model.eval().to(device=dev)
input_shape = (1, 3, 300, 300) input_shape = (1, 3, 300, 300)
x = torch.rand(input_shape) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
x = torch.rand(input_shape).to(device=dev)
out = model(x) out = model(x)
self.assertEqual(tuple(out["out"].shape), (1, 50, 300, 300)) self.assertEqual(tuple(out["out"].shape), (1, 50, 300, 300))
self.checkModule(model, name, (x,)) self.checkModule(model, name, (x,))
def _test_detection_model(self, name): if dev == "cuda":
with torch.cuda.amp.autocast():
out = model(x)
self.assertEqual(tuple(out["out"].shape), (1, 50, 300, 300))
def _test_detection_model(self, name, dev):
set_rng_seed(0) set_rng_seed(0)
model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False) model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False)
model.eval() model.eval().to(device=dev)
input_shape = (3, 300, 300) input_shape = (3, 300, 300)
x = torch.rand(input_shape) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
x = torch.rand(input_shape).to(device=dev)
model_input = [x] model_input = [x]
out = model(model_input) out = model(model_input)
self.assertIs(model_input[0], x) self.assertIs(model_input[0], x)
self.assertEqual(len(out), 1)
def subsample_tensor(tensor): def check_out(out):
num_elems = tensor.numel() self.assertEqual(len(out), 1)
num_samples = 20
if num_elems <= num_samples: def subsample_tensor(tensor):
return tensor num_elems = tensor.numel()
num_samples = 20
flat_tensor = tensor.flatten() if num_elems <= num_samples:
ith_index = num_elems // num_samples return tensor
return flat_tensor[ith_index - 1::ith_index]
flat_tensor = tensor.flatten()
def compute_mean_std(tensor): ith_index = num_elems // num_samples
# can't compute mean of integral tensor return flat_tensor[ith_index - 1::ith_index]
tensor = tensor.to(torch.double)
mean = torch.mean(tensor) def compute_mean_std(tensor):
std = torch.std(tensor) # can't compute mean of integral tensor
return {"mean": mean, "std": std} tensor = tensor.to(torch.double)
mean = torch.mean(tensor)
# maskrcnn_resnet_50_fpn numerically unstable across platforms, so for now std = torch.std(tensor)
# compare results with mean and std return {"mean": mean, "std": std}
if name == "maskrcnn_resnet50_fpn":
test_value = map_nested_tensor_object(out, tensor_map_fn=compute_mean_std) # maskrcnn_resnet_50_fpn numerically unstable across platforms, so for now
# mean values are small, use large prec # compare results with mean and std
self.assertExpected(test_value, prec=.01) if name == "maskrcnn_resnet50_fpn":
else: test_value = map_nested_tensor_object(out, tensor_map_fn=compute_mean_std)
self.assertExpected(map_nested_tensor_object(out, tensor_map_fn=subsample_tensor), prec=0.01) # mean values are small, use large prec
self.assertExpected(test_value, prec=.01, strip_suffix="_" + dev)
else:
self.assertExpected(map_nested_tensor_object(out, tensor_map_fn=subsample_tensor),
prec=0.01,
strip_suffix="_" + dev)
check_out(out)
scripted_model = torch.jit.script(model) scripted_model = torch.jit.script(model)
scripted_model.eval() scripted_model.eval()
...@@ -156,6 +198,13 @@ class ModelTester(TestCase): ...@@ -156,6 +198,13 @@ class ModelTester(TestCase):
# self.check_script(model, name) # self.check_script(model, name)
self.checkModule(model, name, ([x],)) self.checkModule(model, name, ([x],))
if dev == "cuda":
with torch.cuda.amp.autocast():
out = model(model_input)
# See autocast_flaky_numerics comment at top of file.
if name not in autocast_flaky_numerics:
check_out(out)
def _test_detection_model_validation(self, name): def _test_detection_model_validation(self, name):
set_rng_seed(0) set_rng_seed(0)
model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False) model = models.detection.__dict__[name](num_classes=50, pretrained_backbone=False)
...@@ -179,18 +228,24 @@ class ModelTester(TestCase): ...@@ -179,18 +228,24 @@ class ModelTester(TestCase):
targets = [{'boxes': boxes}] targets = [{'boxes': boxes}]
self.assertRaises(ValueError, model, x, targets=targets) self.assertRaises(ValueError, model, x, targets=targets)
def _test_video_model(self, name): def _test_video_model(self, name, dev):
# the default input shape is # the default input shape is
# bs * num_channels * clip_len * h *w # bs * num_channels * clip_len * h *w
input_shape = (1, 3, 4, 112, 112) input_shape = (1, 3, 4, 112, 112)
# test both basicblock and Bottleneck # test both basicblock and Bottleneck
model = models.video.__dict__[name](num_classes=50) model = models.video.__dict__[name](num_classes=50)
model.eval() model.eval().to(device=dev)
x = torch.rand(input_shape) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests
x = torch.rand(input_shape).to(device=dev)
out = model(x) out = model(x)
self.checkModule(model, name, (x,)) self.checkModule(model, name, (x,))
self.assertEqual(out.shape[-1], 50) self.assertEqual(out.shape[-1], 50)
if dev == "cuda":
with torch.cuda.amp.autocast():
out = model(x)
self.assertEqual(out.shape[-1], 50)
def _make_sliced_model(self, model, stop_layer): def _make_sliced_model(self, model, stop_layer):
layers = OrderedDict() layers = OrderedDict()
for name, layer in model.named_children(): for name, layer in model.named_children():
...@@ -272,6 +327,12 @@ class ModelTester(TestCase): ...@@ -272,6 +327,12 @@ class ModelTester(TestCase):
@unittest.skipIf(not torch.cuda.is_available(), 'needs GPU') @unittest.skipIf(not torch.cuda.is_available(), 'needs GPU')
def test_fasterrcnn_switch_devices(self): def test_fasterrcnn_switch_devices(self):
def checkOut(out):
self.assertEqual(len(out), 1)
self.assertTrue("boxes" in out[0])
self.assertTrue("scores" in out[0])
self.assertTrue("labels" in out[0])
model = models.detection.fasterrcnn_resnet50_fpn(num_classes=50, pretrained_backbone=False) model = models.detection.fasterrcnn_resnet50_fpn(num_classes=50, pretrained_backbone=False)
model.cuda() model.cuda()
model.eval() model.eval()
...@@ -280,17 +341,20 @@ class ModelTester(TestCase): ...@@ -280,17 +341,20 @@ class ModelTester(TestCase):
model_input = [x] model_input = [x]
out = model(model_input) out = model(model_input)
self.assertIs(model_input[0], x) self.assertIs(model_input[0], x)
self.assertEqual(len(out), 1)
self.assertTrue("boxes" in out[0]) checkOut(out)
self.assertTrue("scores" in out[0])
self.assertTrue("labels" in out[0]) with torch.cuda.amp.autocast():
out = model(model_input)
checkOut(out)
# now switch to cpu and make sure it works # now switch to cpu and make sure it works
model.cpu() model.cpu()
x = x.cpu() x = x.cpu()
out_cpu = model([x]) out_cpu = model([x])
self.assertTrue("boxes" in out_cpu[0])
self.assertTrue("scores" in out_cpu[0]) checkOut(out_cpu)
self.assertTrue("labels" in out_cpu[0])
def test_generalizedrcnn_transform_repr(self): def test_generalizedrcnn_transform_repr(self):
...@@ -312,34 +376,40 @@ class ModelTester(TestCase): ...@@ -312,34 +376,40 @@ class ModelTester(TestCase):
self.assertEqual(t.__repr__(), expected_string) self.assertEqual(t.__repr__(), expected_string)
_devs = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
for model_name in get_available_classification_models(): for model_name in get_available_classification_models():
# for-loop bodies don't define scopes, so we have to save the variables for dev in _devs:
# we want to close over in some way # for-loop bodies don't define scopes, so we have to save the variables
def do_test(self, model_name=model_name): # we want to close over in some way
input_shape = (1, 3, 224, 224) def do_test(self, model_name=model_name, dev=dev):
if model_name in ['inception_v3']: input_shape = (1, 3, 224, 224)
input_shape = (1, 3, 299, 299) if model_name in ['inception_v3']:
self._test_classification_model(model_name, input_shape) input_shape = (1, 3, 299, 299)
self._test_classification_model(model_name, input_shape, dev)
setattr(ModelTester, "test_" + model_name, do_test) setattr(ModelTester, "test_" + model_name + "_" + dev, do_test)
for model_name in get_available_segmentation_models(): for model_name in get_available_segmentation_models():
# for-loop bodies don't define scopes, so we have to save the variables for dev in _devs:
# we want to close over in some way # for-loop bodies don't define scopes, so we have to save the variables
def do_test(self, model_name=model_name): # we want to close over in some way
self._test_segmentation_model(model_name) def do_test(self, model_name=model_name, dev=dev):
self._test_segmentation_model(model_name, dev)
setattr(ModelTester, "test_" + model_name, do_test) setattr(ModelTester, "test_" + model_name + "_" + dev, do_test)
for model_name in get_available_detection_models(): for model_name in get_available_detection_models():
# for-loop bodies don't define scopes, so we have to save the variables for dev in _devs:
# we want to close over in some way # for-loop bodies don't define scopes, so we have to save the variables
def do_test(self, model_name=model_name): # we want to close over in some way
self._test_detection_model(model_name) def do_test(self, model_name=model_name, dev=dev):
self._test_detection_model(model_name, dev)
setattr(ModelTester, "test_" + model_name, do_test) setattr(ModelTester, "test_" + model_name + "_" + dev, do_test)
def do_validation_test(self, model_name=model_name): def do_validation_test(self, model_name=model_name):
self._test_detection_model_validation(model_name) self._test_detection_model_validation(model_name)
...@@ -348,11 +418,11 @@ for model_name in get_available_detection_models(): ...@@ -348,11 +418,11 @@ for model_name in get_available_detection_models():
for model_name in get_available_video_models(): for model_name in get_available_video_models():
for dev in _devs:
def do_test(self, model_name=model_name, dev=dev):
self._test_video_model(model_name, dev)
def do_test(self, model_name=model_name): setattr(ModelTester, "test_" + model_name + "_" + dev, do_test)
self._test_video_model(model_name)
setattr(ModelTester, "test_" + model_name, do_test)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -52,25 +52,30 @@ class OpTester(object): ...@@ -52,25 +52,30 @@ class OpTester(object):
class RoIOpTester(OpTester): class RoIOpTester(OpTester):
def _test_forward(self, device, contiguous): def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None):
x_dtype = self.dtype if x_dtype is None else x_dtype
rois_dtype = self.dtype if rois_dtype is None else rois_dtype
pool_size = 5 pool_size = 5
# n_channels % (pool_size ** 2) == 0 required for PS opeartions. # n_channels % (pool_size ** 2) == 0 required for PS opeartions.
n_channels = 2 * (pool_size ** 2) n_channels = 2 * (pool_size ** 2)
x = torch.rand(2, n_channels, 10, 10, dtype=self.dtype, device=device) x = torch.rand(2, n_channels, 10, 10, dtype=x_dtype, device=device)
if not contiguous: if not contiguous:
x = x.permute(0, 1, 3, 2) x = x.permute(0, 1, 3, 2)
rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy)
[0, 0, 5, 4, 9], [0, 0, 5, 4, 9],
[0, 5, 5, 9, 9], [0, 5, 5, 9, 9],
[1, 0, 0, 9, 9]], [1, 0, 0, 9, 9]],
dtype=self.dtype, device=device) dtype=rois_dtype, device=device)
pool_h, pool_w = pool_size, pool_size pool_h, pool_w = pool_size, pool_size
y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1) y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
# the following should be true whether we're running an autocast test or not.
self.assertTrue(y.dtype == x.dtype)
gt_y = self.expected_fn(x, rois, pool_h, pool_w, spatial_scale=1, gt_y = self.expected_fn(x, rois, pool_h, pool_w, spatial_scale=1,
sampling_ratio=-1, device=device, dtype=self.dtype) sampling_ratio=-1, device=device, dtype=self.dtype)
self.assertTrue(torch.allclose(gt_y, y)) tol = 1e-3 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5
self.assertTrue(torch.allclose(gt_y.to(y.dtype), y, rtol=tol, atol=tol))
def _test_backward(self, device, contiguous): def _test_backward(self, device, contiguous):
pool_size = 2 pool_size = 2
...@@ -290,6 +295,13 @@ class RoIAlignTester(RoIOpTester, unittest.TestCase): ...@@ -290,6 +295,13 @@ class RoIAlignTester(RoIOpTester, unittest.TestCase):
def _test_boxes_shape(self): def _test_boxes_shape(self):
self._helper_boxes_shape(ops.roi_align) self._helper_boxes_shape(ops.roi_align)
@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_roi_align_autocast(self):
for x_dtype in (torch.float, torch.half):
for rois_dtype in (torch.float, torch.half):
with torch.cuda.amp.autocast():
self._test_forward(torch.device("cuda"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
class PSRoIAlignTester(RoIOpTester, unittest.TestCase): class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "cpu/vision_cpu.h" #include "cpu/vision_cpu.h"
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "autocast.h"
#include "cuda/vision_cuda.h" #include "cuda/vision_cuda.h"
#endif #endif
#ifdef WITH_HIP #ifdef WITH_HIP
...@@ -11,6 +12,7 @@ ...@@ -11,6 +12,7 @@
// TODO: put this stuff in torchvision namespace // TODO: put this stuff in torchvision namespace
// roi_align dispatch nexus
at::Tensor roi_align( at::Tensor roi_align(
const at::Tensor& input, // Input feature map. const at::Tensor& input, // Input feature map.
const at::Tensor& rois, // List of ROIs to pool over. const at::Tensor& rois, // List of ROIs to pool over.
...@@ -35,6 +37,28 @@ at::Tensor roi_align( ...@@ -35,6 +37,28 @@ at::Tensor roi_align(
aligned); aligned);
} }
#ifdef WITH_CUDA
at::Tensor ROIAlign_autocast(
const at::Tensor& input,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const bool aligned) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
return roi_align(
autocast::_cast(at::kFloat, input),
autocast::_cast(at::kFloat, rois),
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio,
aligned)
.to(input.scalar_type());
}
#endif
at::Tensor _roi_align_backward( at::Tensor _roi_align_backward(
const at::Tensor& grad, const at::Tensor& grad,
const at::Tensor& rois, const at::Tensor& rois,
......
#pragma once
#ifdef WITH_CUDA
namespace autocast {
inline bool is_eligible(const at::Tensor& arg) {
return (
arg.is_cuda() && arg.is_floating_point() &&
(arg.scalar_type() != at::kDouble));
}
// Overload to catch Tensor args
inline at::Tensor _cast(at::ScalarType to_type, const at::Tensor& arg) {
if (is_eligible(arg) && (arg.scalar_type() != to_type)) {
return arg.to(to_type);
} else {
return arg;
}
}
// Template to catch non-Tensor args
template <typename T>
inline T _cast(at::ScalarType to_type, T arg) {
return arg;
}
} // namespace autocast
#endif
...@@ -4,7 +4,7 @@ template <typename scalar_t> ...@@ -4,7 +4,7 @@ template <typename scalar_t>
at::Tensor nms_cpu_kernel( at::Tensor nms_cpu_kernel(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const float iou_threshold) { const double iou_threshold) {
AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor"); AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor"); AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM( AT_ASSERTM(
...@@ -72,7 +72,26 @@ at::Tensor nms_cpu_kernel( ...@@ -72,7 +72,26 @@ at::Tensor nms_cpu_kernel(
at::Tensor nms_cpu( at::Tensor nms_cpu(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const float iou_threshold) { const double iou_threshold) {
TORCH_CHECK(
dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
TORCH_CHECK(
dets.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
dets.size(1));
TORCH_CHECK(
scores.dim() == 1,
"scores should be a 1d tensor, got ",
scores.dim(),
"D");
TORCH_CHECK(
dets.size(0) == scores.size(0),
"boxes and scores should have same number of elements in ",
"dimension 0, got ",
dets.size(0),
" and ",
scores.size(0));
auto result = at::empty({0}, dets.options()); auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
......
...@@ -85,7 +85,7 @@ at::Tensor PSROIAlign_backward_cpu( ...@@ -85,7 +85,7 @@ at::Tensor PSROIAlign_backward_cpu(
at::Tensor nms_cpu( at::Tensor nms_cpu(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const float iou_threshold); const double iou_threshold);
at::Tensor DeformConv2d_forward_cpu( at::Tensor DeformConv2d_forward_cpu(
const at::Tensor& input, const at::Tensor& input,
......
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#if defined(WITH_CUDA)
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#elif defined(WITH_HIP)
#include <c10/hip/HIPGuard.h>
#endif
#include "cuda_helpers.h" #include "cuda_helpers.h"
...@@ -70,10 +75,40 @@ __global__ void nms_kernel( ...@@ -70,10 +75,40 @@ __global__ void nms_kernel(
at::Tensor nms_cuda(const at::Tensor& dets, at::Tensor nms_cuda(const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
float iou_threshold) { const double iou_threshold) {
AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor"); AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor"); AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
TORCH_CHECK(
dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
TORCH_CHECK(
dets.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
dets.size(1));
TORCH_CHECK(
scores.dim() == 1,
"scores should be a 1d tensor, got ",
scores.dim(),
"D");
TORCH_CHECK(
dets.size(0) == scores.size(0),
"boxes and scores should have same number of elements in ",
"dimension 0, got ",
dets.size(0),
" and ",
scores.size(0))
#if defined(WITH_CUDA)
at::cuda::CUDAGuard device_guard(dets.device()); at::cuda::CUDAGuard device_guard(dets.device());
#elif defined(WITH_HIP)
at::cuda::HIPGuard device_guard(dets.device());
#else
AT_ERROR("Not compiled with GPU support");
#endif
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto dets_sorted = dets.index_select(0, order_t).contiguous(); auto dets_sorted = dets.index_select(0, order_t).contiguous();
......
#pragma once #pragma once
#if defined(WITH_CUDA)
#include <c10/cuda/CUDAGuard.h>
#elif defined(WITH_HIP)
#include <c10/hip/HIPGuard.h>
#endif
#include <torch/extension.h> #include <torch/extension.h>
at::Tensor ROIAlign_forward_cuda( at::Tensor ROIAlign_forward_cuda(
...@@ -90,7 +85,7 @@ at::Tensor PSROIAlign_backward_cuda( ...@@ -90,7 +85,7 @@ at::Tensor PSROIAlign_backward_cuda(
at::Tensor nms_cuda( at::Tensor nms_cuda(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const float iou_threshold); const double iou_threshold);
at::Tensor DeformConv2d_forward_cuda( at::Tensor DeformConv2d_forward_cuda(
const at::Tensor& input, const at::Tensor& input,
......
...@@ -2,52 +2,33 @@ ...@@ -2,52 +2,33 @@
#include "cpu/vision_cpu.h" #include "cpu/vision_cpu.h"
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "autocast.h"
#include "cuda/vision_cuda.h" #include "cuda/vision_cuda.h"
#endif #endif
#ifdef WITH_HIP #ifdef WITH_HIP
#include "hip/vision_cuda.h" #include "hip/vision_cuda.h"
#endif #endif
// nms dispatch nexus
at::Tensor nms( at::Tensor nms(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const double iou_threshold) { const double iou_threshold) {
TORCH_CHECK( static auto op = c10::Dispatcher::singleton()
dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D"); .findSchemaOrThrow("torchvision::nms", "")
TORCH_CHECK( .typed<decltype(nms)>();
dets.size(1) == 4, return op.call(dets, scores, iou_threshold);
"boxes should have 4 elements in dimension 1, got ", }
dets.size(1));
TORCH_CHECK(
scores.dim() == 1,
"scores should be a 1d tensor, got ",
scores.dim(),
"D");
TORCH_CHECK(
dets.size(0) == scores.size(0),
"boxes and scores should have same number of elements in ",
"dimension 0, got ",
dets.size(0),
" and ",
scores.size(0));
if (dets.is_cuda()) {
#if defined(WITH_CUDA)
if (dets.numel() == 0) {
at::cuda::CUDAGuard device_guard(dets.device());
return at::empty({0}, dets.options().dtype(at::kLong));
}
return nms_cuda(dets, scores, iou_threshold);
#elif defined(WITH_HIP)
if (dets.numel() == 0) {
at::cuda::HIPGuard device_guard(dets.device());
return at::empty({0}, dets.options().dtype(at::kLong));
}
return nms_cuda(dets, scores, iou_threshold);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
at::Tensor result = nms_cpu(dets, scores, iou_threshold); #ifdef WITH_CUDA
return result; at::Tensor nms_autocast(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
return nms(
autocast::_cast(at::kFloat, dets),
autocast::_cast(at::kFloat, scores),
iou_threshold);
} }
#endif
...@@ -43,7 +43,7 @@ int64_t _cuda_version() { ...@@ -43,7 +43,7 @@ int64_t _cuda_version() {
} }
TORCH_LIBRARY(torchvision, m) { TORCH_LIBRARY(torchvision, m) {
m.def("nms", &nms); m.def("nms(Tensor dets, Tensor scores, float iou_threshold) -> Tensor");
m.def( m.def(
"roi_align(Tensor input, Tensor rois, float spatial_scale, int pooled_height, int pooled_width, int sampling_ratio, bool aligned) -> Tensor"); "roi_align(Tensor input, Tensor rois, float spatial_scale, int pooled_height, int pooled_width, int sampling_ratio, bool aligned) -> Tensor");
m.def( m.def(
...@@ -59,6 +59,7 @@ TORCH_LIBRARY(torchvision, m) { ...@@ -59,6 +59,7 @@ TORCH_LIBRARY(torchvision, m) {
TORCH_LIBRARY_IMPL(torchvision, CPU, m) { TORCH_LIBRARY_IMPL(torchvision, CPU, m) {
m.impl("roi_align", ROIAlign_forward_cpu); m.impl("roi_align", ROIAlign_forward_cpu);
m.impl("_roi_align_backward", ROIAlign_backward_cpu); m.impl("_roi_align_backward", ROIAlign_backward_cpu);
m.impl("nms", nms_cpu);
} }
// TODO: Place this in a hypothetical separate torchvision_cuda library // TODO: Place this in a hypothetical separate torchvision_cuda library
...@@ -66,6 +67,15 @@ TORCH_LIBRARY_IMPL(torchvision, CPU, m) { ...@@ -66,6 +67,15 @@ TORCH_LIBRARY_IMPL(torchvision, CPU, m) {
TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { TORCH_LIBRARY_IMPL(torchvision, CUDA, m) {
m.impl("roi_align", ROIAlign_forward_cuda); m.impl("roi_align", ROIAlign_forward_cuda);
m.impl("_roi_align_backward", ROIAlign_backward_cuda); m.impl("_roi_align_backward", ROIAlign_backward_cuda);
m.impl("nms", nms_cuda);
}
#endif
// Autocast only needs to wrap forward pass ops.
#if defined(WITH_CUDA)
TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
m.impl("roi_align", ROIAlign_autocast);
m.impl("nms", nms_autocast);
} }
#endif #endif
......
...@@ -243,7 +243,14 @@ class MultiScaleRoIAlign(nn.Module): ...@@ -243,7 +243,14 @@ class MultiScaleRoIAlign(nn.Module):
if torchvision._is_tracing(): if torchvision._is_tracing():
tracing_results.append(result_idx_in_level.to(dtype)) tracing_results.append(result_idx_in_level.to(dtype))
else: else:
result[idx_in_level] = result_idx_in_level # result and result_idx_in_level's dtypes are based on dtypes of different
# elements in x_filtered. x_filtered contains tensors output by different
# layers. When autocast is active, it may choose different dtypes for
# different layers' outputs. Therefore, we defensively match result's dtype
# before copying elements from result_idx_in_level in the following op.
# We need to cast manually (can't rely on autocast to cast for us) because
# the op acts on result in-place, and autocast only affects out-of-place ops.
result[idx_in_level] = result_idx_in_level.to(result.dtype)
if torchvision._is_tracing(): if torchvision._is_tracing():
result = _onnx_merge_levels(levels, tracing_results) result = _onnx_merge_levels(levels, tracing_results)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment