Commit b9dc151a authored by Yanghan Wang, committed by Facebook GitHub Bot

support force exporting gpu model for rcnn meta_arch

Summary:
Pull Request resolved: https://github.com/facebookresearch/d2go/pull/191

When exporting a model to TorchScript (with `MODEL.DEVICE = "cpu"`), the pixel mean/std are baked in as constants instead of model parameters. As a result, after casting the TorchScript model to CUDA, the mean/std remain on CPU, which breaks inference on GPU.
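A minimal sketch of the failure mode (not from this diff; exact behavior can vary across PyTorch versions): a tensor held as a plain attribute, rather than a parameter or registered buffer, is baked into the traced graph as a constant, so casting the module to CUDA does not move it.

```python
import torch

class Normalizer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # plain attribute: not an nn.Parameter and not registered as a buffer
        self.pixel_mean = torch.tensor([103.53, 116.28, 123.675])

    def forward(self, x):
        return x - self.pixel_mean

traced = torch.jit.trace(Normalizer(), torch.rand(3))
traced.cuda()  # moves parameters/buffers; the traced-in constant stays on CPU
# traced(torch.rand(3, device="cuda"))  # device-mismatch error at runtime
```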

The fix is to export the model with `MODEL.DEVICE = "cuda"`. However, D2Go internally uses "cpu" during export by default (via CLI: https://fburl.com/code/4mpk153i, via workflow: https://fburl.com/code/zcj5ud4u). For the CLI, the user can set `--device` manually, but for the workflow this is hard to do. Furthermore, it's hard to support mixed models with a single `--device` option. So this diff adds special handling in the RCNN's `default_prepare_for_export` to bypass the `--device` option.
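As a hypothetical usage sketch (assuming d2go's `convert_and_export_predictor` entry point; `cfg`, `model`, and `data_loader` come from the usual d2go runner setup), a caller opts into the GPU export through the predictor type rather than `--device`:

```python
from d2go.export.api import convert_and_export_predictor

predictor_path = convert_and_export_predictor(
    cfg,                # cfg.MODEL.DEVICE may remain "cpu"
    model,
    "torchscript_gpu",  # handled by default_rcnn_prepare_for_export: the model
                        # is cast to CUDA and the "_gpu" suffix is stripped
    "./export_output",
    data_loader,        # provides example inputs for tracing
)
```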

Reviewed By: zhanghang1989

Differential Revision: D35097613

fbshipit-source-id: df9f44f49af1f0fd4baf3d7ccae6c31e341f3ef6
parent a781894c
@@ -2,6 +2,7 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import inspect
 import logging
 import torch
@@ -59,6 +60,16 @@ class GeneralizedRCNNPatch:
 @RCNN_PREPARE_FOR_EXPORT_REGISTRY.register()
 def default_rcnn_prepare_for_export(self, cfg, inputs, predictor_type):
+    pytorch_model = self
+    # NOTE: currently Exporter doesn't support specifying exporting GPU model via
+    # `model_export_method` in a general way. For RCNN model, we only need to cast
+    # the model to GPU and trace the model (scripting might not work) normally to
+    # get the GPU torchscripts.
+    if "_gpu" in predictor_type:
+        pytorch_model = _cast_detection_model(pytorch_model, "cuda")
+        predictor_type = predictor_type.replace("_gpu", "", 1)
+
     if (
         "@c2_ops" in predictor_type
         or "caffe2" in predictor_type
@@ -67,7 +78,7 @@ def default_rcnn_prepare_for_export(self, cfg, inputs, predictor_type):
         from detectron2.export.caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP

         C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
-        c2_compatible_model = C2MetaArch(cfg, self)
+        c2_compatible_model = C2MetaArch(cfg, pytorch_model)

         preprocess_info = FuncInfo.gen_func_info(
             D2Caffe2MetaArchPreprocessFunc,
@@ -100,8 +111,9 @@ def default_rcnn_prepare_for_export(self, cfg, inputs, predictor_type):
     )
     preprocess_func = preprocess_info.instantiate()

     return PredictorExportConfig(
-        model=D2RCNNInferenceWrapper(self),
+        model=D2RCNNInferenceWrapper(pytorch_model),
         data_generator=lambda x: (preprocess_func(x),),
+        model_export_method=predictor_type,
         preprocess_info=preprocess_info,
         postprocess_info=FuncInfo.gen_func_info(
             D2RCNNInferenceWrapper.Postprocess, params={}
@@ -430,3 +442,28 @@ class D2RCNNInferenceWrapper(nn.Module):
            width, height = batch[0]["width"], batch[0]["height"]
            r = detector_postprocess(outputs, height, width)
            return [{"instances": r}]
+
+
+# TODO: model.to(device) might not work for detection meta-arch, this function is the
+# workaround, in general, we might need a meta-arch API for this if needed.
+def _cast_detection_model(model, device):
+    # check model is an instance of one of the meta arch
+    from detectron2.export.caffe2_modeling import Caffe2MetaArch
+    from detectron2.modeling import META_ARCH_REGISTRY
+
+    if isinstance(model, Caffe2MetaArch):
+        model._wrapped_model = _cast_detection_model(model._wrapped_model, device)
+        return model
+
+    assert isinstance(model, tuple(META_ARCH_REGISTRY._obj_map.values()))
+    model.to(device)
+    # cast normalizer separately
+    if hasattr(model, "normalizer") and not (
+        hasattr(model, "pixel_mean") and hasattr(model, "pixel_std")
+    ):
+        pixel_mean = inspect.getclosurevars(model.normalizer).nonlocals["pixel_mean"]
+        pixel_std = inspect.getclosurevars(model.normalizer).nonlocals["pixel_std"]
+        pixel_mean = pixel_mean.to(device)
+        pixel_std = pixel_std.to(device)
+        model.normalizer = lambda x: (x - pixel_mean) / pixel_std
+    return model
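For reference, the `inspect.getclosurevars` trick above can be sketched standalone (hypothetical names, not part of this diff): it recovers the tensors captured by the `normalizer` closure so they can be moved and re-captured.

```python
import inspect
import torch

def make_normalizer(pixel_mean, pixel_std):
    return lambda x: (x - pixel_mean) / pixel_std

normalizer = make_normalizer(torch.zeros(3), torch.ones(3))

# Recover the closed-over tensors by name...
captured = inspect.getclosurevars(normalizer).nonlocals
pixel_mean = captured["pixel_mean"].to("cpu")  # "cuda" in the real use case
pixel_std = captured["pixel_std"].to("cpu")
# ...and rebuild the closure around the moved tensors.
normalizer = lambda x: (x - pixel_mean) / pixel_std
```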
@@ -3,7 +3,6 @@
 import copy
-import inspect
 import logging
 import math
 from typing import Tuple
@@ -137,27 +136,12 @@ def add_quantization_default_configs(_C):

 # TODO: model.to(device) might not work for detection meta-arch, this function is the
 # workaround, in general, we might need a meta-arch API for this if needed.
-def _cast_detection_model(model, device):
-    # check model is an instance of one of the meta arch
-    from detectron2.export.caffe2_modeling import Caffe2MetaArch
-    from detectron2.modeling import META_ARCH_REGISTRY
-
-    if isinstance(model, Caffe2MetaArch):
-        model._wrapped_model = _cast_detection_model(model._wrapped_model, device)
-        return model
-
-    assert isinstance(model, tuple(META_ARCH_REGISTRY._obj_map.values()))
-    model.to(device)
-    # cast normalizer separately
-    if hasattr(model, "normalizer") and not (
-        hasattr(model, "pixel_mean") and hasattr(model, "pixel_std")
-    ):
-        pixel_mean = inspect.getclosurevars(model.normalizer).nonlocals["pixel_mean"]
-        pixel_std = inspect.getclosurevars(model.normalizer).nonlocals["pixel_std"]
-        pixel_mean = pixel_mean.to(device)
-        pixel_std = pixel_std.to(device)
-        model.normalizer = lambda x: (x - pixel_mean) / pixel_std
-    return model
+def _cast_model_to_device(model, device):
+    from d2go.modeling.meta_arch.rcnn import _cast_detection_model
+    from detectron2.modeling import GeneralizedRCNN
+
+    assert isinstance(model, GeneralizedRCNN), "Currently only availabe for RCNN"
+    return _cast_detection_model(model, device)


 def add_d2_quant_mapping(mappings):
def add_d2_quant_mapping(mappings):
@@ -304,7 +288,7 @@ def post_training_quantize(cfg, model, data_loader):
     if calibration_force_on_gpu:
         # NOTE: model.to(device) may not handle cases such as normalizer, FPN, only
         # do move to GPU if specified.
-        _cast_detection_model(model, "cuda")
+        _cast_model_to_device(model, "cuda")

     calibration_iters = cfg.QUANTIZATION.PTQ.CALIBRATION_NUM_IMAGES
     for idx, inputs in enumerate(data_loader):
@@ -327,7 +311,7 @@ def post_training_quantize(cfg, model, data_loader):
     # cast model back to the original device
     if calibration_force_on_gpu:
-        _cast_detection_model(model, cfg.MODEL.DEVICE)
+        _cast_model_to_device(model, cfg.MODEL.DEVICE)

     return model
@@ -180,8 +180,10 @@ class MockRCNNInference(object):
         return results


-def _validate_outputs(inputs, outputs):
+def _validate_outputs(inputs, outputs, is_gpu=False):
     assert len(inputs) == len(outputs)
+    if is_gpu:
+        assert outputs[0]["instances"].pred_classes.device.type == "cuda"

     # TODO: figure out how to validate outputs
@@ -311,7 +313,8 @@ class RCNNBaseTestCases:
            predictor = create_predictor(predictor_path)
            predictor_outputs = predictor(inputs)
-           _validate_outputs(inputs, predictor_outputs)
+           is_gpu = self.cfg.MODEL.DEVICE != "cpu" or "_gpu" in predictor_type
+           _validate_outputs(inputs, predictor_outputs, is_gpu=is_gpu)

            if compare_match:
                with torch.no_grad():
@@ -19,6 +19,14 @@ from mobile_cv.common.misc.file_utils import make_temp_directory
 patch_d2_meta_arch()


+def _maybe_skip_test(self, predictor_type):
+    if os.getenv("OSSRUN") == "1" and "@c2_ops" in predictor_type:
+        self.skipTest("Caffe2 is not available for OSS")
+    if not torch.cuda.is_available() and "_gpu" in predictor_type:
+        self.skipTest("GPU is not available for exporting GPU model")
+
+
 class TestFBNetV3MaskRCNNFP32(RCNNBaseTestCases.TemplateTestCase):
     def setup_custom_test(self):
         super().setup_custom_test()
@@ -31,13 +39,13 @@ class TestFBNetV3MaskRCNNFP32(RCNNBaseTestCases.TemplateTestCase):
        [
            ["torchscript@c2_ops", True],
            ["torchscript", True],
+           ["torchscript_gpu", False],  # can't compare across device
            ["torchscript_int8@c2_ops", False],
            ["torchscript_int8", False],
        ]
    )
    def test_export(self, predictor_type, compare_match):
-       if os.getenv("OSSRUN") == "1" and "@c2_ops" in predictor_type:
-           self.skipTest("Caffe2 is not available for OSS")
+       _maybe_skip_test(self, predictor_type)
        self._test_export(predictor_type, compare_match=compare_match)
@@ -58,8 +66,7 @@ class TestFBNetV3MaskRCNNFPNFP32(RCNNBaseTestCases.TemplateTestCase):
        ]
    )
    def test_export(self, predictor_type, compare_match):
-       if os.getenv("OSSRUN") == "1" and "@c2_ops" in predictor_type:
-           self.skipTest("Caffe2 is not available for OSS")
+       _maybe_skip_test(self, predictor_type)
        self._test_export(predictor_type, compare_match=compare_match)
@@ -89,8 +96,7 @@ class TestFBNetV3MaskRCNNQATEager(RCNNBaseTestCases.TemplateTestCase):
        ]
    )
    def test_export(self, predictor_type, compare_match):
-       if os.getenv("OSSRUN") == "1" and "@c2_ops" in predictor_type:
-           self.skipTest("Caffe2 is not available for OSS")
+       _maybe_skip_test(self, predictor_type)
        self._test_export(predictor_type, compare_match=compare_match)