Commit 7f7e7663 authored by Lara Haidar's avatar Lara Haidar Committed by Francisco Massa
Browse files

Fix Windows build in Torchvision Custom op Registration (#1320)

* Revert "Revert "Register Torchvision Ops as Custom Ops (#1267)" (#1316)"

This reverts commit fe234fc8.

* Make import of C++ extensions lazy

* define python initialization functions for extension

* Fix lint
parent 6ddda3ae
...@@ -47,6 +47,10 @@ before_install: ...@@ -47,6 +47,10 @@ before_install:
- pip install future - pip install future
- pip install pytest pytest-cov codecov - pip install pytest pytest-cov codecov
- pip install mock - pip install mock
- |
if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then
pip install onnxruntime
fi
- conda install av -c conda-forge - conda install av -c conda-forge
......
...@@ -96,12 +96,21 @@ def get_extensions(): ...@@ -96,12 +96,21 @@ def get_extensions():
source_models = [os.path.join(models_dir, s) for s in source_models] source_models = [os.path.join(models_dir, s) for s in source_models]
tests = test_file + source_models tests = test_file + source_models
custom_ops_sources = [os.path.join(extensions_dir, "custom_ops", "custom_ops.cpp"),
os.path.join(extensions_dir, "cpu", "nms_cpu.cpp"),
os.path.join(extensions_dir, "cpu", "ROIAlign_cpu.cpp"),
os.path.join(extensions_dir, "cpu", "ROIPool_cpu.cpp")]
custom_ops_sources_cuda = [os.path.join(extensions_dir, "cuda", "nms_cuda.cu"),
os.path.join(extensions_dir, "cuda", "ROIAlign_cuda.cu"),
os.path.join(extensions_dir, "cuda", "ROIPool_cuda.cu")]
define_macros = [] define_macros = []
extra_compile_args = {} extra_compile_args = {}
if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv('FORCE_CUDA', '0') == '1': if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv('FORCE_CUDA', '0') == '1':
extension = CUDAExtension extension = CUDAExtension
sources += source_cuda sources += source_cuda
custom_ops_sources += custom_ops_sources_cuda
define_macros += [('WITH_CUDA', None)] define_macros += [('WITH_CUDA', None)]
nvcc_flags = os.getenv('NVCC_FLAGS', '') nvcc_flags = os.getenv('NVCC_FLAGS', '')
if nvcc_flags == '': if nvcc_flags == '':
...@@ -138,7 +147,14 @@ def get_extensions(): ...@@ -138,7 +147,14 @@ def get_extensions():
include_dirs=tests_include_dirs, include_dirs=tests_include_dirs,
define_macros=define_macros, define_macros=define_macros,
extra_compile_args=extra_compile_args, extra_compile_args=extra_compile_args,
) ),
extension(
"torchvision._custom_ops",
sources=custom_ops_sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
),
] ]
return ext_modules return ext_modules
...@@ -179,5 +195,6 @@ setup( ...@@ -179,5 +195,6 @@ setup(
"scipy": ["scipy"], "scipy": ["scipy"],
}, },
ext_modules=get_extensions(), ext_modules=get_extensions(),
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension, 'clean': clean} cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension,
'clean': clean}
) )
import io
import torch
from torchvision import ops
# onnxruntime requires python 3.5 or above
try:
import onnxruntime
except ImportError:
onnxruntime = None
import unittest
@unittest.skipIf(onnxruntime is None, 'ONNX Runtime unavailable')
class ONNXExporterTester(unittest.TestCase):
    """Exports torchvision ops to ONNX and validates the results with ONNX Runtime.

    Each test runs a small model eagerly in PyTorch, exports it with
    torch.onnx.export, executes the exported graph under onnxruntime, and
    asserts both produce (numerically close) identical outputs.
    """

    @classmethod
    def setUpClass(cls):
        # Fixed seed so the random inputs below are reproducible across runs.
        torch.manual_seed(123)

    def run_model(self, model, inputs):
        """Run ``model`` on ``inputs`` eagerly, export to ONNX, and cross-check.

        ``inputs`` may be a single Tensor or a tuple of Tensors.
        """
        model.eval()

        # Run the PyTorch model to obtain the reference outputs.
        with torch.no_grad():
            if isinstance(inputs, torch.Tensor):
                inputs = (inputs,)
            outputs = model(*inputs)
            if isinstance(outputs, torch.Tensor):
                outputs = (outputs,)

        onnx_io = io.BytesIO()
        # Export to ONNX. Opset 10 is the first opset providing
        # NonMaxSuppression / RoiAlign, which the custom-op symbolics target.
        torch.onnx.export(model, inputs, onnx_io, do_constant_folding=True, opset_version=10)

        # Validate the exported model with ONNX Runtime.
        self.ort_validate(onnx_io, inputs, outputs)

    def ort_validate(self, onnx_io, inputs, outputs):
        """Execute the serialized ONNX model and compare against ``outputs``."""
        inputs, _ = torch.jit._flatten(inputs)
        outputs, _ = torch.jit._flatten(outputs)

        def to_numpy(tensor):
            if tensor.requires_grad:
                return tensor.detach().cpu().numpy()
            else:
                return tensor.cpu().numpy()

        inputs = list(map(to_numpy, inputs))
        outputs = list(map(to_numpy, outputs))

        ort_session = onnxruntime.InferenceSession(onnx_io.getvalue())
        # Bind inputs positionally to the session's declared input names.
        ort_inputs = dict((ort_session.get_inputs()[i].name, inpt) for i, inpt in enumerate(inputs))
        ort_outs = ort_session.run(None, ort_inputs)

        # Compare ONNX Runtime and PyTorch results.
        for i in range(0, len(outputs)):
            torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05)

    def test_nms(self):
        boxes = torch.rand(5, 4)
        boxes[:, 2:] += torch.rand(5, 2)
        scores = torch.randn(5)

        class Module(torch.nn.Module):
            def forward(self, boxes, scores):
                return ops.nms(boxes, scores, 0.5)

        self.run_model(Module(), (boxes, scores))

    def test_roi_pool(self):
        # BUG FIX: this test previously constructed ops.RoIAlign (and
        # test_roi_align constructed ops.RoIPool) -- the two bodies were
        # swapped. It now exercises RoIPool as its name promises.
        x = torch.rand(1, 1, 10, 10, dtype=torch.float32)
        rois = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32)
        pool_h = 5
        pool_w = 5
        model = ops.RoIPool((pool_h, pool_w), 2)
        model.eval()
        self.run_model(model, (x, rois))

    def test_roi_align(self):
        x = torch.rand(1, 1, 10, 10, dtype=torch.float32)
        single_roi = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32)
        model = ops.RoIAlign((5, 5), 1, 2)
        self.run_model(model, (x, single_roi))
if __name__ == '__main__':
unittest.main()
...@@ -10,11 +10,11 @@ ...@@ -10,11 +10,11 @@
at::Tensor ROIAlign_forward( at::Tensor ROIAlign_forward(
const at::Tensor& input, // Input feature map. const at::Tensor& input, // Input feature map.
const at::Tensor& rois, // List of ROIs to pool over. const at::Tensor& rois, // List of ROIs to pool over.
const float spatial_scale, // The scale of the image features. ROIs will be const double spatial_scale, // The scale of the image features. ROIs will be
// scaled to this. // scaled to this.
const int pooled_height, // The height of the pooled feature map. const int64_t pooled_height, // The height of the pooled feature map.
const int pooled_width, // The width of the pooled feature const int64_t pooled_width, // The width of the pooled feature
const int sampling_ratio) // The number of points to sample in each bin const int64_t sampling_ratio) // The number of points to sample in each bin
// along each axis. // along each axis.
{ {
if (input.type().is_cuda()) { if (input.type().is_cuda()) {
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
std::tuple<at::Tensor, at::Tensor> ROIPool_forward( std::tuple<at::Tensor, at::Tensor> ROIPool_forward(
const at::Tensor& input, const at::Tensor& input,
const at::Tensor& rois, const at::Tensor& rois,
const float spatial_scale, const double spatial_scale,
const int pooled_height, const int64_t pooled_height,
const int pooled_width) { const int64_t pooled_width) {
if (input.type().is_cuda()) { if (input.type().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return ROIPool_forward_cuda( return ROIPool_forward_cuda(
......
#include <Python.h>
#include <torch/script.h>
#include "ROIAlign.h"
#include "ROIPool.h"
#include "nms.h"
using namespace at;
// If we are in a Windows environment, we need to define
// initialization functions for the _custom_ops extension.
// NOTE(review): on Windows the build/link step requires the module init
// symbol (init_custom_ops / PyInit__custom_ops) to exist for a module named
// _custom_ops. These stubs only satisfy the linker; CPython treats a NULL
// return from a Python 3 PyInit_* hook as an import failure, so this relies
// on the library being loaded via torch.ops.load_library() rather than
// `import torchvision._custom_ops` -- confirm against _custom_ops.py.
#ifdef _WIN32
#if PY_MAJOR_VERSION < 3
PyMODINIT_FUNC init_custom_ops(void) {
// No need to do anything.
// _custom_ops.py will run on load
return NULL;
}
#else
PyMODINIT_FUNC PyInit__custom_ops(void) {
// No need to do anything.
// _custom_ops.py will run on load
return NULL;
}
#endif
#endif
// Register the torchvision operators with the TorchScript dispatcher so they
// are callable as torch.ops.torchvision.* (and traceable for ONNX export).
// roi_align supplies an explicit schema string; nms and roi_pool let the
// schema be inferred from the C++ signatures (which use double/int64_t, the
// types TorchScript maps to `float`/`int`).
static auto registry =
torch::RegisterOperators()
.op("torchvision::nms", &nms)
.op("torchvision::roi_align(Tensor input, Tensor rois, float spatial_scale, int pooled_height, int pooled_width, int sampling_ratio) -> Tensor",
&ROIAlign_forward)
.op("torchvision::roi_pool", &ROIPool_forward);
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
at::Tensor nms( at::Tensor nms(
const at::Tensor& dets, const at::Tensor& dets,
const at::Tensor& scores, const at::Tensor& scores,
const float iou_threshold) { const double iou_threshold) {
if (dets.device().is_cuda()) { if (dets.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
if (dets.numel() == 0) { if (dets.numel() == 0) {
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
#endif #endif
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// TODO: remove nms from here since it is now registered
// and used as a PyTorch custom op
m.def("nms", &nms, "non-maximum suppression"); m.def("nms", &nms, "non-maximum suppression");
m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
......
...@@ -10,6 +10,7 @@ def _lazy_import(): ...@@ -10,6 +10,7 @@ def _lazy_import():
return _C return _C
import torch import torch
from torchvision import _C as C from torchvision import _C as C
import torchvision.ops._custom_ops
_C = C _C = C
if hasattr(_C, "CUDA_VERSION") and torch.version.cuda is not None: if hasattr(_C, "CUDA_VERSION") and torch.version.cuda is not None:
tv_version = str(_C.CUDA_VERSION) tv_version = str(_C.CUDA_VERSION)
......
import os
import sys
import imp
import torch
# load the custom_op_library and register the custom ops
# The compiled torchvision._custom_ops extension lives one directory above
# this package; imp.find_module resolves its platform-specific filename
# (.so/.pyd) without importing it as a Python module.
lib_dir = os.path.join(os.path.dirname(__file__), '..')
file, path, description = imp.find_module("_custom_ops", [lib_dir])
# Loading the shared library runs its static initializers, which register
# the torchvision::* operators with torch (see custom_ops.cpp).
torch.ops.load_library(path)
def register_custom_op():
    """Register ONNX symbolic functions for the torchvision custom ops.

    Maps ``torchvision::nms``, ``torchvision::roi_align`` and
    ``torchvision::roi_pool`` onto ONNX operators (opset 10, the first opset
    with NonMaxSuppression/RoiAlign) so traced models using these ops can be
    exported via ``torch.onnx.export``.
    """
    # Fix: dropped `scalar_type_to_onnx` and `reshape` from these imports --
    # neither was used anywhere in this function.
    from torch.onnx.symbolic_helper import parse_args
    from torch.onnx.symbolic_opset9 import select, unsqueeze, squeeze, _cast_Long

    @parse_args('v', 'v', 'f')
    def symbolic_multi_label_nms(g, boxes, scores, iou_threshold):
        # ONNX NonMaxSuppression expects batched, per-class input:
        # boxes -> (1, num_boxes, 4), scores -> (1, 1, num_boxes).
        boxes = unsqueeze(g, boxes, 0)
        scores = unsqueeze(g, unsqueeze(g, scores, 0), 0)
        # No cap on kept boxes: torchvision nms returns every survivor.
        max_output_per_class = g.op('Constant', value_t=torch.tensor([sys.maxsize], dtype=torch.long))
        iou_threshold = g.op('Constant', value_t=torch.tensor([iou_threshold], dtype=torch.float))
        nms_out = g.op('NonMaxSuppression', boxes, scores, max_output_per_class, iou_threshold)
        # NonMaxSuppression yields (num_selected, 3) rows of
        # [batch_index, class_index, box_index]; column 2 is the kept indices.
        return squeeze(g, select(g, nms_out, 1, g.op('Constant', value_t=torch.tensor([2], dtype=torch.long))), 1)

    @parse_args('v', 'v', 'f', 'i', 'i', 'i')
    def roi_align(g, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio):
        # rois are (N, 5) rows of [batch_idx, x1, y1, x2, y2]; ONNX RoiAlign
        # takes batch indices and coordinates as two separate inputs.
        batch_indices = _cast_Long(g, squeeze(g, select(g, rois, 1, g.op('Constant',
                                   value_t=torch.tensor([0], dtype=torch.long))), 1), False)
        rois = select(g, rois, 1, g.op('Constant', value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long)))
        return g.op('RoiAlign', input, rois, batch_indices, spatial_scale_f=spatial_scale,
                    output_height_i=pooled_height, output_width_i=pooled_width, sampling_ratio_i=sampling_ratio)

    @parse_args('v', 'v', 'f', 'i', 'i')
    def roi_pool(g, input, rois, spatial_scale, pooled_height, pooled_width):
        # MaxRoiPool consumes the (N, 5) roi format directly.
        roi_pool = g.op('MaxRoiPool', input, rois,
                        pooled_shape_i=(pooled_height, pooled_width), spatial_scale_f=spatial_scale)
        # torchvision::roi_pool returns (output, argmax); argmax has no ONNX
        # counterpart here, so export None in its place.
        return roi_pool, None

    from torch.onnx import register_custom_op_symbolic
    register_custom_op_symbolic('torchvision::nms', symbolic_multi_label_nms, 10)
    register_custom_op_symbolic('torchvision::roi_align', roi_align, 10)
    register_custom_op_symbolic('torchvision::roi_pool', roi_pool, 10)
register_custom_op()
...@@ -29,8 +29,8 @@ def nms(boxes, scores, iou_threshold): ...@@ -29,8 +29,8 @@ def nms(boxes, scores, iou_threshold):
of the elements that have been kept of the elements that have been kept
by NMS, sorted in decreasing order of scores by NMS, sorted in decreasing order of scores
""" """
_C = _lazy_import() _lazy_import()
return _C.nms(boxes, scores, iou_threshold) return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
def batched_nms(boxes, scores, idxs, iou_threshold): def batched_nms(boxes, scores, idxs, iou_threshold):
......
...@@ -66,6 +66,13 @@ def roi_align(input, boxes, output_size, spatial_scale=1.0, sampling_ratio=-1): ...@@ -66,6 +66,13 @@ def roi_align(input, boxes, output_size, spatial_scale=1.0, sampling_ratio=-1):
rois = boxes rois = boxes
if not isinstance(rois, torch.Tensor): if not isinstance(rois, torch.Tensor):
rois = convert_boxes_to_roi_format(rois) rois = convert_boxes_to_roi_format(rois)
# TODO: Change this to support backwards, which we
# do not currently support when JIT tracing.
if torch._C._get_tracing_state():
_lazy_import()
return torch.ops.torchvision.roi_align(input, rois, spatial_scale,
output_size[0], output_size[1],
sampling_ratio)
return _RoIAlignFunction.apply(input, rois, output_size, spatial_scale, sampling_ratio) return _RoIAlignFunction.apply(input, rois, output_size, spatial_scale, sampling_ratio)
......
...@@ -59,6 +59,13 @@ def roi_pool(input, boxes, output_size, spatial_scale=1.0): ...@@ -59,6 +59,13 @@ def roi_pool(input, boxes, output_size, spatial_scale=1.0):
rois = boxes rois = boxes
if not isinstance(rois, torch.Tensor): if not isinstance(rois, torch.Tensor):
rois = convert_boxes_to_roi_format(rois) rois = convert_boxes_to_roi_format(rois)
# TODO: Change this to support backwards, which we
# do not currently support when JIT tracing.
if torch._C._get_tracing_state():
_lazy_import()
output, _ = torch.ops.torchvision.roi_pool(input, rois, spatial_scale,
output_size[0], output_size[1])
return output
return _RoIPoolFunction.apply(input, rois, output_size, spatial_scale) return _RoIPoolFunction.apply(input, rois, output_size, spatial_scale)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment