Commit 3b8d508a authored by lishj6

init_0905

parent e968ab0f
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
#include <math.h>  // ceil() used in the host-side launch configuration
__global__ void nearest_assign_kernel(
const int* l2s_key,
int l2s_size,
const int* occind2detind,
const int *__restrict__ occ_pred,
const int *__restrict__ inst_xyz,
const int *__restrict__ inst_cls,
const int *__restrict__ inst_id_list,
int inst_size,
int* __restrict__ inst_pred) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard against threads beyond the 200x200x16 occupancy grid (one thread per voxel).
  if (idx < 200 * 200 * 16)
  {
    int occ_pred_label = occ_pred[idx];
    int dist_min = 100000000;
    // Voxels whose semantic label appears in l2s_key get re-assigned to an instance id.
    for (int index = 0; index < l2s_size; index++)
    {
      if (occ_pred_label == l2s_key[index])
      {
        // Recover the (x, y, z) voxel coordinate from the flat index.
        int x = idx / (200 * 16);
        int y = (idx - x * 200 * 16) / 16;
        int z = idx - x * 200 * 16 - y * 16;
        // Assign the voxel to the nearest instance center of the matching detection class.
        for (int inst_ind = 0; inst_ind < inst_size; inst_ind++)
        {
          if (inst_cls[inst_ind] == occind2detind[occ_pred_label])
          {
            int dx = x - inst_xyz[inst_ind * 3 + 0];
            int dy = y - inst_xyz[inst_ind * 3 + 1];
            int dz = z - inst_xyz[inst_ind * 3 + 2];
            int dist = dx * dx + dy * dy + dz * dz;
            if (dist < dist_min) {
              dist_min = dist;
              inst_pred[idx] = inst_id_list[inst_ind];
            }
          }
        }
        return;
      }
    }
    // Labels without an instance mapping keep their semantic prediction.
    inst_pred[idx] = occ_pred[idx];
  }
}
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred) {
  // Launch one thread per voxel of the 200x200x16 grid, 256 threads per block.
  nearest_assign_kernel<<<(int)ceil((double)200 * 200 * 16 / 256), 256>>>(
      l2s_key, l2s_size, occind2detind,
      occ_pred, inst_xyz, inst_cls,
      inst_id_list, inst_size, inst_pred
  );
}
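# The block below is a rough NumPy reference of what the kernel above computes per
# voxel. It is illustrative only (it assumes the same 200x200x16 voxel layout and
# NumPy-array inputs) and is not part of the compiled extension.
import numpy as np

def nearest_assign_reference(l2s_key, occind2detind, occ_pred, inst_xyz, inst_cls, inst_id_list):
    # occ_pred: flat array of 200*200*16 semantic labels; inst_xyz: (N, 3) instance centers.
    inst_pred = occ_pred.copy()
    for idx, label in enumerate(occ_pred):
        if label not in l2s_key:
            continue  # labels without an instance mapping keep the semantic prediction
        # recover the (x, y, z) voxel coordinate from the flat index
        x, rem = divmod(idx, 200 * 16)
        y, z = divmod(rem, 16)
        # consider only instances whose detection class matches the occupancy label
        mask = inst_cls == occind2detind[label]
        if not mask.any():
            continue
        d2 = ((inst_xyz[mask] - np.array([x, y, z])) ** 2).sum(axis=1)
        inst_pred[idx] = inst_id_list[mask][np.argmin(d2)]
    return inst_pred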
from setuptools import find_packages, setup
import os
import shutil
import sys
import torch
import warnings
from os import path as osp
from torch.utils.cpp_extension import (BuildExtension, CppExtension,
CUDAExtension)
def make_cuda_ext(name,
module,
sources,
sources_cuda=[],
extra_args=[],
extra_include_path=[]):
define_macros = []
extra_compile_args = {'cxx': [] + extra_args}
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('WITH_CUDA', None)]
extension = CUDAExtension
extra_compile_args['nvcc'] = extra_args + [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
sources += sources_cuda
else:
print('Compiling {} without CUDA'.format(name))
extension = CppExtension
# raise EnvironmentError('CUDA is required to compile MMDetection!')
return extension(
name='{}.{}'.format(module, name),
sources=[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=extra_include_path,
define_macros=define_macros,
extra_compile_args=extra_compile_args)
if __name__ == '__main__':
setup(
name='flashocc_plugin',
description=("OpenMMLab's next-generation platform"
'for general 3D object detection.'),
long_description_content_type='text/markdown',
author='MMDetection3D Contributors',
author_email='zwwdev@gmail.com',
keywords='computer vision, 3D object detection',
url='https://github.com/open-mmlab/mmdetection3d',
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
license='Apache License 2.0',
ext_modules=[
make_cuda_ext(
name="bev_pool_ext",
module="mmdet3d_plugin.ops.bev_pool",
sources=[
"src/bev_pooling.cpp",
"src/bev_sum_pool.cpp",
"src/bev_sum_pool_cuda.cu",
"src/bev_max_pool.cpp",
"src/bev_max_pool_cuda.cu",
],
),
make_cuda_ext(
name="bev_pool_v2_ext",
module="mmdet3d_plugin.ops.bev_pool_v2",
sources=[
"src/bev_pool.cpp",
"src/bev_pool_cuda.cu"
],
),
make_cuda_ext(
name="nearest_assign_ext",
module="mmdet3d_plugin.ops.nearest_assign",
sources=[
"src/nearest_assign.cpp",
"src/nearest_assign_cuda.cu"
],
),
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)
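# Typical build/installation of the extensions declared above (assuming a working
# CUDA toolchain; this is standard setuptools usage, not specific to this repo):
#   pip install -v -e .
# which compiles bev_pool_ext, bev_pool_v2_ext and nearest_assign_ext in place.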
import torch
from torch import tensor, device
import torch.fx as fx
from torch._dynamo.testing import rand_strided
from math import inf
import torch._inductor.inductor_prims
import torch._dynamo.config
import torch._inductor.config
import torch._functorch.config
import torch.fx.experimental._config
torch._dynamo.config.capture_scalar_outputs = True
isolate_fails_code_str = None
# torch version: 2.4.1
# torch cuda version: None
# torch git version: 45d303c9e4f41ec2f5450b6f60031246f67189d6
# CUDA Info:
# nvcc not found
# GPU Hardware Info:
# BW200 : 8
from torch.nn import *
class Repro(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, primals_1, primals_2, primals_4, primals_5, primals_6, primals_7, primals_8, primals_10, convert_element_type_1, clamp_max, convert_element_type_3, clamp_max_1, clamp_max_2, clamp_max_3, cat, convolution, squeeze_1, relu, convolution_1, getitem_3, rsqrt_1, convert_element_type_5, clamp_max_4, convert_element_type_7, clamp_max_5, clamp_max_6, clamp_max_7, add_19, convolution_2, squeeze_7, relu_2, unsqueeze_14, unsqueeze_38, tangents_1):
sum_1 = torch.ops.aten.sum.dim_IntList(tangents_1, [0, 2, 3])
convolution_backward = torch.ops.aten.convolution_backward.default(tangents_1, relu_2, primals_10, [256], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]); tangents_1 = primals_10 = None
getitem_6 = convolution_backward[0]
getitem_7 = convolution_backward[1]; convolution_backward = None
le = torch.ops.aten.le.Scalar(relu_2, 0); relu_2 = None
full_default = torch.ops.aten.full.default([], 0.0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
where = torch.ops.aten.where.self(le, full_default, getitem_6); le = getitem_6 = None
sum_2 = torch.ops.aten.sum.dim_IntList(where, [0, 2, 3])
sub_13 = torch.ops.aten.sub.Tensor(convolution_2, unsqueeze_14); convolution_2 = unsqueeze_14 = None
mul_31 = torch.ops.aten.mul.Tensor(where, sub_13)
sum_3 = torch.ops.aten.sum.dim_IntList(mul_31, [0, 2, 3]); mul_31 = None
mul_32 = torch.ops.aten.mul.Tensor(sum_2, 6.25e-06)
unsqueeze_15 = torch.ops.aten.unsqueeze.default(mul_32, 0); mul_32 = None
unsqueeze_16 = torch.ops.aten.unsqueeze.default(unsqueeze_15, 2); unsqueeze_15 = None
unsqueeze_17 = torch.ops.aten.unsqueeze.default(unsqueeze_16, 3); unsqueeze_16 = None
mul_33 = torch.ops.aten.mul.Tensor(sum_3, 6.25e-06)
mul_34 = torch.ops.aten.mul.Tensor(squeeze_7, squeeze_7)
mul_35 = torch.ops.aten.mul.Tensor(mul_33, mul_34); mul_33 = mul_34 = None
unsqueeze_18 = torch.ops.aten.unsqueeze.default(mul_35, 0); mul_35 = None
unsqueeze_19 = torch.ops.aten.unsqueeze.default(unsqueeze_18, 2); unsqueeze_18 = None
unsqueeze_20 = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3); unsqueeze_19 = None
mul_36 = torch.ops.aten.mul.Tensor(squeeze_7, primals_8); primals_8 = None
unsqueeze_21 = torch.ops.aten.unsqueeze.default(mul_36, 0); mul_36 = None
unsqueeze_22 = torch.ops.aten.unsqueeze.default(unsqueeze_21, 2); unsqueeze_21 = None
unsqueeze_23 = torch.ops.aten.unsqueeze.default(unsqueeze_22, 3); unsqueeze_22 = None
mul_37 = torch.ops.aten.mul.Tensor(sub_13, unsqueeze_20); sub_13 = unsqueeze_20 = None
sub_15 = torch.ops.aten.sub.Tensor(where, mul_37); where = mul_37 = None
sub_16 = torch.ops.aten.sub.Tensor(sub_15, unsqueeze_17); sub_15 = unsqueeze_17 = None
mul_38 = torch.ops.aten.mul.Tensor(sub_16, unsqueeze_23); sub_16 = unsqueeze_23 = None
mul_39 = torch.ops.aten.mul.Tensor(sum_3, squeeze_7); sum_3 = squeeze_7 = None
convolution_backward_1 = torch.ops.aten.convolution_backward.default(mul_38, add_19, primals_7, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_38 = add_19 = primals_7 = None
getitem_9 = convolution_backward_1[0]
getitem_10 = convolution_backward_1[1]; convolution_backward_1 = None
mul_40 = torch.ops.aten.mul.Tensor(getitem_9, clamp_max_7); clamp_max_7 = None
neg = torch.ops.aten.neg.default(mul_40)
add_25 = torch.ops.aten.add.Tensor(getitem_9, neg); getitem_9 = neg = None
mul_41 = torch.ops.aten.mul.Tensor(mul_40, clamp_max_6)
neg_1 = torch.ops.aten.neg.default(mul_41)
add_26 = torch.ops.aten.add.Tensor(mul_40, neg_1); mul_40 = neg_1 = None
mul_42 = torch.ops.aten.mul.Tensor(add_25, clamp_max_6); clamp_max_6 = None
neg_2 = torch.ops.aten.neg.default(mul_42)
add_27 = torch.ops.aten.add.Tensor(add_25, neg_2); add_25 = neg_2 = None
full_default_1 = torch.ops.aten.full.default([4, 512, 100, 100], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
_unsafe_index_put = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, clamp_max_5], mul_41, True); mul_41 = None
_unsafe_index_put_1 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, convert_element_type_7], add_26, True); clamp_max_4 = add_26 = None
add_28 = torch.ops.aten.add.Tensor(_unsafe_index_put, _unsafe_index_put_1); _unsafe_index_put = _unsafe_index_put_1 = None
_unsafe_index_put_2 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, clamp_max_5], mul_42, True); clamp_max_5 = mul_42 = None
add_29 = torch.ops.aten.add.Tensor(add_28, _unsafe_index_put_2); add_28 = _unsafe_index_put_2 = None
_unsafe_index_put_3 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, convert_element_type_7], add_27, True); full_default_1 = convert_element_type_5 = convert_element_type_7 = add_27 = None
add_30 = torch.ops.aten.add.Tensor(add_29, _unsafe_index_put_3); add_29 = _unsafe_index_put_3 = None
sub_6 = torch.ops.aten.sub.Tensor(convolution_1, getitem_3)
mul_12 = torch.ops.aten.mul.Tensor(sub_6, rsqrt_1); sub_6 = None
unsqueeze_4 = torch.ops.aten.unsqueeze.default(primals_5, -1)
unsqueeze_5 = torch.ops.aten.unsqueeze.default(unsqueeze_4, -1); unsqueeze_4 = None
mul_18 = torch.ops.aten.mul.Tensor(mul_12, unsqueeze_5); mul_12 = unsqueeze_5 = None
unsqueeze_6 = torch.ops.aten.unsqueeze.default(primals_6, -1); primals_6 = None
unsqueeze_7 = torch.ops.aten.unsqueeze.default(unsqueeze_6, -1); unsqueeze_6 = None
add_14 = torch.ops.aten.add.Tensor(mul_18, unsqueeze_7); mul_18 = unsqueeze_7 = None
relu_1 = torch.ops.aten.relu.default(add_14); add_14 = None
le_1 = torch.ops.aten.le.Scalar(relu_1, 0); relu_1 = None
where_1 = torch.ops.aten.where.self(le_1, full_default, add_30); le_1 = add_30 = None
squeeze_3 = torch.ops.aten.squeeze.dims(getitem_3, [0, 2, 3]); getitem_3 = None
unsqueeze_24 = torch.ops.aten.unsqueeze.default(squeeze_3, 0); squeeze_3 = None
unsqueeze_25 = torch.ops.aten.unsqueeze.default(unsqueeze_24, 2); unsqueeze_24 = None
unsqueeze_26 = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3); unsqueeze_25 = None
sum_4 = torch.ops.aten.sum.dim_IntList(where_1, [0, 2, 3])
sub_17 = torch.ops.aten.sub.Tensor(convolution_1, unsqueeze_26); convolution_1 = unsqueeze_26 = None
mul_43 = torch.ops.aten.mul.Tensor(where_1, sub_17)
sum_5 = torch.ops.aten.sum.dim_IntList(mul_43, [0, 2, 3]); mul_43 = None
mul_44 = torch.ops.aten.mul.Tensor(sum_4, 2.5e-05)
unsqueeze_27 = torch.ops.aten.unsqueeze.default(mul_44, 0); mul_44 = None
unsqueeze_28 = torch.ops.aten.unsqueeze.default(unsqueeze_27, 2); unsqueeze_27 = None
unsqueeze_29 = torch.ops.aten.unsqueeze.default(unsqueeze_28, 3); unsqueeze_28 = None
mul_45 = torch.ops.aten.mul.Tensor(sum_5, 2.5e-05)
squeeze_4 = torch.ops.aten.squeeze.dims(rsqrt_1, [0, 2, 3]); rsqrt_1 = None
mul_46 = torch.ops.aten.mul.Tensor(squeeze_4, squeeze_4)
mul_47 = torch.ops.aten.mul.Tensor(mul_45, mul_46); mul_45 = mul_46 = None
unsqueeze_30 = torch.ops.aten.unsqueeze.default(mul_47, 0); mul_47 = None
unsqueeze_31 = torch.ops.aten.unsqueeze.default(unsqueeze_30, 2); unsqueeze_30 = None
unsqueeze_32 = torch.ops.aten.unsqueeze.default(unsqueeze_31, 3); unsqueeze_31 = None
mul_48 = torch.ops.aten.mul.Tensor(squeeze_4, primals_5); primals_5 = None
unsqueeze_33 = torch.ops.aten.unsqueeze.default(mul_48, 0); mul_48 = None
unsqueeze_34 = torch.ops.aten.unsqueeze.default(unsqueeze_33, 2); unsqueeze_33 = None
unsqueeze_35 = torch.ops.aten.unsqueeze.default(unsqueeze_34, 3); unsqueeze_34 = None
mul_49 = torch.ops.aten.mul.Tensor(sub_17, unsqueeze_32); sub_17 = unsqueeze_32 = None
sub_19 = torch.ops.aten.sub.Tensor(where_1, mul_49); where_1 = mul_49 = None
sub_20 = torch.ops.aten.sub.Tensor(sub_19, unsqueeze_29); sub_19 = unsqueeze_29 = None
mul_50 = torch.ops.aten.mul.Tensor(sub_20, unsqueeze_35); sub_20 = unsqueeze_35 = None
mul_51 = torch.ops.aten.mul.Tensor(sum_5, squeeze_4); sum_5 = squeeze_4 = None
convolution_backward_2 = torch.ops.aten.convolution_backward.default(mul_50, relu, primals_4, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_50 = primals_4 = None
getitem_12 = convolution_backward_2[0]
getitem_13 = convolution_backward_2[1]; convolution_backward_2 = None
le_2 = torch.ops.aten.le.Scalar(relu, 0); relu = None
where_2 = torch.ops.aten.where.self(le_2, full_default, getitem_12); le_2 = full_default = getitem_12 = None
sum_6 = torch.ops.aten.sum.dim_IntList(where_2, [0, 2, 3])
sub_21 = torch.ops.aten.sub.Tensor(convolution, unsqueeze_38); convolution = unsqueeze_38 = None
mul_52 = torch.ops.aten.mul.Tensor(where_2, sub_21)
sum_7 = torch.ops.aten.sum.dim_IntList(mul_52, [0, 2, 3]); mul_52 = None
mul_53 = torch.ops.aten.mul.Tensor(sum_6, 2.5e-05)
unsqueeze_39 = torch.ops.aten.unsqueeze.default(mul_53, 0); mul_53 = None
unsqueeze_40 = torch.ops.aten.unsqueeze.default(unsqueeze_39, 2); unsqueeze_39 = None
unsqueeze_41 = torch.ops.aten.unsqueeze.default(unsqueeze_40, 3); unsqueeze_40 = None
mul_54 = torch.ops.aten.mul.Tensor(sum_7, 2.5e-05)
mul_55 = torch.ops.aten.mul.Tensor(squeeze_1, squeeze_1)
mul_56 = torch.ops.aten.mul.Tensor(mul_54, mul_55); mul_54 = mul_55 = None
unsqueeze_42 = torch.ops.aten.unsqueeze.default(mul_56, 0); mul_56 = None
unsqueeze_43 = torch.ops.aten.unsqueeze.default(unsqueeze_42, 2); unsqueeze_42 = None
unsqueeze_44 = torch.ops.aten.unsqueeze.default(unsqueeze_43, 3); unsqueeze_43 = None
mul_57 = torch.ops.aten.mul.Tensor(squeeze_1, primals_2); primals_2 = None
unsqueeze_45 = torch.ops.aten.unsqueeze.default(mul_57, 0); mul_57 = None
unsqueeze_46 = torch.ops.aten.unsqueeze.default(unsqueeze_45, 2); unsqueeze_45 = None
unsqueeze_47 = torch.ops.aten.unsqueeze.default(unsqueeze_46, 3); unsqueeze_46 = None
mul_58 = torch.ops.aten.mul.Tensor(sub_21, unsqueeze_44); sub_21 = unsqueeze_44 = None
sub_23 = torch.ops.aten.sub.Tensor(where_2, mul_58); where_2 = mul_58 = None
sub_24 = torch.ops.aten.sub.Tensor(sub_23, unsqueeze_41); sub_23 = unsqueeze_41 = None
mul_59 = torch.ops.aten.mul.Tensor(sub_24, unsqueeze_47); sub_24 = unsqueeze_47 = None
mul_60 = torch.ops.aten.mul.Tensor(sum_7, squeeze_1); sum_7 = squeeze_1 = None
convolution_backward_3 = torch.ops.aten.convolution_backward.default(mul_59, cat, primals_1, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_59 = cat = primals_1 = None
getitem_15 = convolution_backward_3[0]
getitem_16 = convolution_backward_3[1]; convolution_backward_3 = None
slice_1 = torch.ops.aten.slice.Tensor(getitem_15, 1, 0, 128)
slice_2 = torch.ops.aten.slice.Tensor(getitem_15, 1, 128, 640); getitem_15 = None
mul_61 = torch.ops.aten.mul.Tensor(slice_2, clamp_max_3); clamp_max_3 = None
neg_3 = torch.ops.aten.neg.default(mul_61)
add_31 = torch.ops.aten.add.Tensor(slice_2, neg_3); slice_2 = neg_3 = None
mul_62 = torch.ops.aten.mul.Tensor(mul_61, clamp_max_2)
neg_4 = torch.ops.aten.neg.default(mul_62)
add_32 = torch.ops.aten.add.Tensor(mul_61, neg_4); mul_61 = neg_4 = None
mul_63 = torch.ops.aten.mul.Tensor(add_31, clamp_max_2); clamp_max_2 = None
neg_5 = torch.ops.aten.neg.default(mul_63)
add_33 = torch.ops.aten.add.Tensor(add_31, neg_5); add_31 = neg_5 = None
full_default_7 = torch.ops.aten.full.default([4, 512, 25, 25], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
_unsafe_index_put_4 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, clamp_max_1], mul_62, True); mul_62 = None
_unsafe_index_put_5 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, convert_element_type_3], add_32, True); clamp_max = add_32 = None
add_34 = torch.ops.aten.add.Tensor(_unsafe_index_put_4, _unsafe_index_put_5); _unsafe_index_put_4 = _unsafe_index_put_5 = None
_unsafe_index_put_6 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, clamp_max_1], mul_63, True); clamp_max_1 = mul_63 = None
add_35 = torch.ops.aten.add.Tensor(add_34, _unsafe_index_put_6); add_34 = _unsafe_index_put_6 = None
_unsafe_index_put_7 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, convert_element_type_3], add_33, True); full_default_7 = convert_element_type_1 = convert_element_type_3 = add_33 = None
add_36 = torch.ops.aten.add.Tensor(add_35, _unsafe_index_put_7); add_35 = _unsafe_index_put_7 = None
return [getitem_16, mul_60, sum_6, getitem_13, mul_51, sum_4, getitem_10, mul_39, sum_2, getitem_7, sum_1, None, None, None, None, None, None, None, None, None, slice_1, add_36]
def load_args(reader):
buf0 = reader.storage('934c55e4a7a69a0a29a96cd8ef9f11c9859658e1', 11796480, device=device(type='cuda', index=2))
reader.tensor(buf0, (512, 640, 3, 3), requires_grad=True, is_leaf=True) # primals_1
buf1 = reader.storage('f12094f433480ec90280d223057708434df38941', 2048, device=device(type='cuda', index=2))
reader.tensor(buf1, (512,), requires_grad=True, is_leaf=True) # primals_2
buf2 = reader.storage('06c46ad2c91ec5c8eebc4fb0be80459bdfe007a8', 9437184, device=device(type='cuda', index=2))
reader.tensor(buf2, (512, 512, 3, 3), requires_grad=True, is_leaf=True) # primals_4
buf3 = reader.storage('aba0c4266c842d1845e720dc0c789942770a60b7', 2048, device=device(type='cuda', index=2))
reader.tensor(buf3, (512,), requires_grad=True, is_leaf=True) # primals_5
buf4 = reader.storage('bb8471d379e03c8ccb9897ce7d3a2dfbacb44e30', 2048, device=device(type='cuda', index=2))
reader.tensor(buf4, (512,), requires_grad=True, is_leaf=True) # primals_6
buf5 = reader.storage('b9484105fb5b2045fb6550a1edb77af72e639416', 4718592, device=device(type='cuda', index=2))
reader.tensor(buf5, (256, 512, 3, 3), requires_grad=True, is_leaf=True) # primals_7
buf6 = reader.storage('b778b8cab416c3fa6763b88e431266ae6ea28941', 1024, device=device(type='cuda', index=2))
reader.tensor(buf6, (256,), requires_grad=True, is_leaf=True) # primals_8
buf7 = reader.storage('c5f14ec72c73a593b47ef4aecf37f6bb25d2dec4', 262144, device=device(type='cuda', index=2))
reader.tensor(buf7, (256, 256, 1, 1), requires_grad=True, is_leaf=True) # primals_10
buf8 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf8, (100, 1), dtype=torch.int64, is_leaf=True) # convert_element_type_1
buf9 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf9, (100, 1), dtype=torch.int64, is_leaf=True) # clamp_max
buf10 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf10, (100,), dtype=torch.int64, is_leaf=True) # convert_element_type_3
buf11 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf11, (100,), dtype=torch.int64, is_leaf=True) # clamp_max_1
buf12 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
reader.tensor(buf12, (100,), is_leaf=True) # clamp_max_2
buf13 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
reader.tensor(buf13, (100, 1), is_leaf=True) # clamp_max_3
buf14 = reader.storage('5d41e66671a283b70001fd74345d8e7e3def00bd', 102400000, device=device(type='cuda', index=2))
reader.tensor(buf14, (4, 640, 100, 100), is_leaf=True) # cat
buf15 = reader.storage('a8fe0ed584571bb3218d663656459a36545be5e6', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf15, (4, 512, 100, 100), is_leaf=True) # convolution
buf16 = reader.storage('0af13bcf109b8ca2df7f5ce3387d51e8576fb30a', 2048, device=device(type='cuda', index=2))
reader.tensor(buf16, (512,), is_leaf=True) # squeeze_1
buf17 = reader.storage('32f14d6fa07f654fbb09ef1563066303a3501eda', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf17, (4, 512, 100, 100), is_leaf=True) # relu
buf18 = reader.storage('aca23d51e723ad9b4bec2e54d6f0af4b5b85cc7d', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf18, (4, 512, 100, 100), is_leaf=True) # convolution_1
buf19 = reader.storage('4940c79e48676c2e1359870dc770e25cd780983d', 2048, device=device(type='cuda', index=2))
reader.tensor(buf19, (1, 512, 1, 1), is_leaf=True) # getitem_3
buf20 = reader.storage('d17407a9f45954a4d0d36e5b20a40ac554cc3aff', 2048, device=device(type='cuda', index=2))
reader.tensor(buf20, (1, 512, 1, 1), is_leaf=True) # rsqrt_1
buf21 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf21, (200, 1), dtype=torch.int64, is_leaf=True) # convert_element_type_5
buf22 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf22, (200, 1), dtype=torch.int64, is_leaf=True) # clamp_max_4
buf23 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf23, (200,), dtype=torch.int64, is_leaf=True) # convert_element_type_7
buf24 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf24, (200,), dtype=torch.int64, is_leaf=True) # clamp_max_5
buf25 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
reader.tensor(buf25, (200,), is_leaf=True) # clamp_max_6
buf26 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
reader.tensor(buf26, (200, 1), is_leaf=True) # clamp_max_7
buf27 = reader.storage('32194c54194bddd5f695a8d306828130629246fc', 327680000, device=device(type='cuda', index=2))
reader.tensor(buf27, (4, 512, 200, 200), is_leaf=True) # add_19
buf28 = reader.storage('e3a286ef8d6373c83ef30afe16eaae96ee52b965', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf28, (4, 256, 200, 200), is_leaf=True) # convolution_2
buf29 = reader.storage('9572b289e6d5c9bdd20a79367d4005440da40795', 1024, device=device(type='cuda', index=2))
reader.tensor(buf29, (256,), is_leaf=True) # squeeze_7
buf30 = reader.storage('42f9ce794a05b12a40f15cbd4abb1201ccef0f72', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf30, (4, 256, 200, 200), is_leaf=True) # relu_2
buf31 = reader.storage('61670207f087dc68f052bc03747d9ab365297b17', 1024, device=device(type='cuda', index=2))
reader.tensor(buf31, (1, 256, 1, 1), is_leaf=True) # unsqueeze_14
buf32 = reader.storage('ab77896e6dd76345e63586ecda30b1e4a63439cc', 2048, device=device(type='cuda', index=2))
reader.tensor(buf32, (1, 512, 1, 1), is_leaf=True) # unsqueeze_38
buf33 = reader.storage('f0ec623d2a44ff0f64fc264faf9128c2a6896e57', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf33, (4, 256, 200, 200), is_leaf=True) # tangents_1
load_args._version = 0
mod = Repro()
if __name__ == '__main__':
from torch._dynamo.repro.after_aot import run_repro
with torch.no_grad():
run_repro(mod, load_args, accuracy=True, command='run', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
# To run it separately, do
# mod, args = run_repro(mod, load_args, accuracy=True, command='get_args', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
# mod(*args)
\ No newline at end of file
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/tests.txt
docutils==0.16.0
m2r
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==4.0.2
sphinx-copybutton
sphinx_markdown_tables
mmcv-full>=1.4.8,<=1.6.0
mmdet>=2.24.0,<=3.0.0
mmsegmentation>=0.20.0,<=1.0.0
open3d
spconv
waymo-open-dataset-tf-2-1-0==1.2.0
mmcv>=1.4.8
mmdet>=2.24.0
mmsegmentation>=0.20.1
torch
torchvision
lyft_dataset_sdk
networkx>=2.2,<2.3
numba==0.53.0
numpy
nuscenes-devkit
plyfile
scikit-image
# by default we also use tensorboard to log results
tensorboard
trimesh>=2.35.39,<2.35.40
asynctest
codecov
flake8
interrogate
isort
# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
kwarray
pytest
pytest-cov
pytest-runner
ubelt
xdoctest >= 0.10.0
yapf
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from collections import defaultdict
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
def cal_train_time(log_dicts, args):
for i, log_dict in enumerate(log_dicts):
print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
all_times = []
for epoch in log_dict.keys():
if args.include_outliers:
all_times.append(log_dict[epoch]['time'])
else:
all_times.append(log_dict[epoch]['time'][1:])
all_times = np.array(all_times)
epoch_ave_time = all_times.mean(-1)
slowest_epoch = epoch_ave_time.argmax()
fastest_epoch = epoch_ave_time.argmin()
std_over_epoch = epoch_ave_time.std()
print(f'slowest epoch {slowest_epoch + 1}, '
f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
print(f'fastest epoch {fastest_epoch + 1}, '
f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
print(f'time std over epochs is {std_over_epoch:.4f}')
print(f'average iter time: {np.mean(all_times):.4f} s/iter')
print()
def plot_curve(log_dicts, args):
if args.backend is not None:
plt.switch_backend(args.backend)
sns.set_style(args.style)
# if legend is None, use {filename}_{key} as legend
legend = args.legend
if legend is None:
legend = []
for json_log in args.json_logs:
for metric in args.keys:
legend.append(f'{json_log}_{metric}')
assert len(legend) == (len(args.json_logs) * len(args.keys))
metrics = args.keys
num_metrics = len(metrics)
for i, log_dict in enumerate(log_dicts):
epochs = list(log_dict.keys())
for j, metric in enumerate(metrics):
print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
if metric not in log_dict[epochs[args.interval - 1]]:
raise KeyError(
f'{args.json_logs[i]} does not contain metric {metric}')
if args.mode == 'eval':
if min(epochs) == args.interval:
x0 = args.interval
else:
# if current training is resumed from previous checkpoint
# we lost information in early epochs
# `xs` should start according to `min(epochs)`
if min(epochs) % args.interval == 0:
x0 = min(epochs)
else:
# find the first epoch that do eval
x0 = min(epochs) + args.interval - \
min(epochs) % args.interval
xs = np.arange(x0, max(epochs) + 1, args.interval)
ys = []
for epoch in epochs[args.interval - 1::args.interval]:
ys += log_dict[epoch][metric]
# if training is aborted before eval of the last epoch
# `xs` and `ys` will have different length and cause an error
# check if `ys[-1]` is empty here
if not log_dict[epoch][metric]:
xs = xs[:-1]
ax = plt.gca()
ax.set_xticks(xs)
plt.xlabel('epoch')
plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
else:
xs = []
ys = []
num_iters_per_epoch = \
log_dict[epochs[args.interval-1]]['iter'][-1]
for epoch in epochs[args.interval - 1::args.interval]:
iters = log_dict[epoch]['iter']
if log_dict[epoch]['mode'][-1] == 'val':
iters = iters[:-1]
xs.append(
np.array(iters) + (epoch - 1) * num_iters_per_epoch)
ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
xs = np.concatenate(xs)
ys = np.concatenate(ys)
plt.xlabel('iter')
plt.plot(
xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
plt.legend()
if args.title is not None:
plt.title(args.title)
if args.out is None:
plt.show()
else:
print(f'save curve to: {args.out}')
plt.savefig(args.out)
plt.cla()
def add_plot_parser(subparsers):
parser_plt = subparsers.add_parser(
'plot_curve', help='parser for plotting curves')
parser_plt.add_argument(
'json_logs',
type=str,
nargs='+',
help='path of train log in json format')
parser_plt.add_argument(
'--keys',
type=str,
nargs='+',
default=['mAP_0.25'],
help='the metric that you want to plot')
parser_plt.add_argument('--title', type=str, help='title of figure')
parser_plt.add_argument(
'--legend',
type=str,
nargs='+',
default=None,
help='legend of each plot')
parser_plt.add_argument(
'--backend', type=str, default=None, help='backend of plt')
parser_plt.add_argument(
'--style', type=str, default='dark', help='style of plt')
parser_plt.add_argument('--out', type=str, default=None)
parser_plt.add_argument('--mode', type=str, default='train')
parser_plt.add_argument('--interval', type=int, default=1)
def add_time_parser(subparsers):
parser_time = subparsers.add_parser(
'cal_train_time',
help='parser for computing the average time per training iteration')
parser_time.add_argument(
'json_logs',
type=str,
nargs='+',
help='path of train log in json format')
parser_time.add_argument(
'--include-outliers',
action='store_true',
help='include the first value of every epoch when computing '
'the average time')
def parse_args():
parser = argparse.ArgumentParser(description='Analyze Json Log')
# currently only support plot curve and calculate average train time
subparsers = parser.add_subparsers(dest='task', help='task parser')
add_plot_parser(subparsers)
add_time_parser(subparsers)
args = parser.parse_args()
return args
def load_json_logs(json_logs):
# load and convert json_logs to log_dict, key is epoch, value is a sub dict
# keys of sub dict is different metrics, e.g. memory, bbox_mAP
# value of sub dict is a list of corresponding values of all iterations
log_dicts = [dict() for _ in json_logs]
for json_log, log_dict in zip(json_logs, log_dicts):
with open(json_log, 'r') as log_file:
for line in log_file:
log = json.loads(line.strip())
# skip lines without `epoch` field
if 'epoch' not in log:
continue
epoch = log.pop('epoch')
if epoch not in log_dict:
log_dict[epoch] = defaultdict(list)
for k, v in log.items():
log_dict[epoch][k].append(v)
return log_dicts
def main():
args = parse_args()
json_logs = args.json_logs
for json_log in json_logs:
assert json_log.endswith('.json')
log_dicts = load_json_logs(json_logs)
eval(args.task)(log_dicts, args)
if __name__ == '__main__':
main()
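# Example invocations (script and log paths are placeholders):
#   python tools/analysis_tools/analyze_logs.py plot_curve work_dirs/run/log.json --keys loss --out loss.png
#   python tools/analysis_tools/analyze_logs.py cal_train_time work_dirs/run/log.json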
# Copyright (c) OpenMMLab. All rights reserved.
import sys
import argparse
import time
import os
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
from tools.misc.fuse_conv_bn import fuse_module
sys.path.insert(0, os.getcwd())
print(sys.path)
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=500, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
parser.add_argument(
'--w_pano',
action='store_true')
parser.add_argument(
'--w_panoproc',
action='store_true')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=0,
dist=False,
shuffle=False)
# build the model and load checkpoint
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
# for i, data_ori in enumerate(data_loader):
# if i == 0:
# break
# import copy
# for i in range(500):
# data = copy.deepcopy(data_ori)
for i, data in enumerate(data_loader):
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
model(return_loss=False, rescale=True,
w_pano=args.w_pano,
w_panoproc=args.w_panoproc,
**data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000 / fps:.2f} ms')
break
if __name__ == '__main__':
main()
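# Example invocation (script, config and checkpoint paths are placeholders):
#   python tools/analysis_tools/benchmark.py configs/flashocc/flashocc-r50.py work_dirs/flashocc/epoch_24.pth --fuse-conv-bn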
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import os
import sys
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
from tools.misc.fuse_conv_bn import fuse_module
sys.path.insert(0, os.getcwd())
print(sys.path)
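# Note: unlike the benchmark above, this variant times the sequential (temporal)
# setting: previous-frame BEV features are pre-computed once via
# extract_img_feat(..., pred_prev=True) and then reused through `feat_prev` with
# sequential=True, with align_after_view_transfromation enabled in the config.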
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=400, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
parser.add_argument(
'--w_pano',
action='store_true')
parser.add_argument(
'--w_panoproc',
action='store_true')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=0,
dist=False,
shuffle=False)
# build the model and load checkpoint
cfg.model.train_cfg = None
cfg.model.align_after_view_transfromation=True
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
for i, data in enumerate(data_loader):
inputs = [d.cuda() for d in data['img_inputs'][0]]
with torch.no_grad():
feat_prev, inputs = model.module.extract_img_feat(
inputs, pred_prev=True, img_metas=None)
data['img_inputs'][0] = inputs
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
model(
return_loss=False,
rescale=True,
sequential=True,
feat_prev=feat_prev,
w_pano=args.w_pano,
w_panoproc=args.w_panoproc,
**data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000 / fps:.2f} ms')
break
if __name__ == '__main__':
main()
\ No newline at end of file
import time
from typing import Dict, Optional, Sequence, Union
import os
from os import path as osp
import sys
sys.path.insert(0, os.getcwd())
import tensorrt as trt
import torch
import torch.onnx
from mmcv import Config
from mmdeploy.backend.tensorrt import load_tensorrt_plugin
try:
# If mmdet version > 2.23.0, compat_cfg would be imported and
# used from mmdet instead of mmdet3d.
from mmdet.utils import compat_cfg
except ImportError:
from mmdet3d.utils import compat_cfg
import argparse
from mmdet3d.core import bbox3d2result
from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model
def parse_args():
parser = argparse.ArgumentParser(description='Deploy BEVDet with Tensorrt')
parser.add_argument('config', help='deploy config file path')
parser.add_argument('engine', help='checkpoint file')
    parser.add_argument('--samples', type=int, default=500, help='samples to benchmark')
parser.add_argument('--postprocessing', action='store_true')
parser.add_argument('--eval', action='store_true')
    parser.add_argument('--prefetch', action='store_true',
                        help='use prefetch to accelerate the data loading; '
                             'the inference speed is slightly degraded due '
                             'to the computational overhead of prefetching')
args = parser.parse_args()
return args
def torch_dtype_from_trt(dtype: trt.DataType) -> torch.dtype:
"""Convert pytorch dtype to TensorRT dtype.
Args:
dtype (str.DataType): The data type in tensorrt.
Returns:
torch.dtype: The corresponding data type in torch.
"""
if dtype == trt.bool:
return torch.bool
elif dtype == trt.int8:
return torch.int8
elif dtype == trt.int32:
return torch.int32
elif dtype == trt.float16:
return torch.float16
elif dtype == trt.float32:
return torch.float32
else:
raise TypeError(f'{dtype} is not supported by torch')
class TRTWrapper(torch.nn.Module):
def __init__(self,
engine: Union[str, trt.ICudaEngine],
output_names: Optional[Sequence[str]] = None) -> None:
super().__init__()
self.engine = engine
if isinstance(self.engine, str):
with trt.Logger() as logger, trt.Runtime(logger) as runtime:
with open(self.engine, mode='rb') as f:
engine_bytes = f.read()
self.engine = runtime.deserialize_cuda_engine(engine_bytes)
self.context = self.engine.create_execution_context()
names = [_ for _ in self.engine]
input_names = list(filter(self.engine.binding_is_input, names))
self._input_names = input_names
self._output_names = output_names
if self._output_names is None:
output_names = list(set(names) - set(input_names))
self._output_names = output_names
def forward(self, inputs: Dict[str, torch.Tensor]):
bindings = [None] * (len(self._input_names) + len(self._output_names))
for input_name, input_tensor in inputs.items():
idx = self.engine.get_binding_index(input_name)
self.context.set_binding_shape(idx, tuple(input_tensor.shape))
bindings[idx] = input_tensor.contiguous().data_ptr()
# create output tensors
outputs = {}
for output_name in self._output_names:
idx = self.engine.get_binding_index(output_name)
dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
shape = tuple(self.context.get_binding_shape(idx))
device = torch.device('cuda')
output = torch.zeros(size=shape, dtype=dtype, device=device)
outputs[output_name] = output
bindings[idx] = output.data_ptr()
self.context.execute_async_v2(bindings,
torch.cuda.current_stream().cuda_stream)
return outputs
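# Example usage of TRTWrapper (illustrative only; the engine path and the binding
# names depend on how the engine was exported, and the input dict must contain a
# tensor for every input binding):
#   trt_model = TRTWrapper('bevdet.engine', output_names=['output_0'])
#   outputs = trt_model.forward(dict(img=img, **metas))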
def get_plugin_names():
return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list]
def main():
load_tensorrt_plugin()
args = parse_args()
    if args.eval:
        args.postprocessing = True
        print('Warning: evaluation requested, forcing '
              'postprocessing=True so that results can be evaluated')
cfg = Config.fromfile(args.config)
cfg.model.pretrained = None
cfg.model.type = cfg.model.type + 'TRT'
cfg = compat_cfg(cfg)
cfg.gpu_ids = [0]
if not args.prefetch:
cfg.data.test_dataloader.workers_per_gpu=0
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build dataloader
assert cfg.data.test.test_mode
test_dataloader_default_args = dict(
samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False)
test_loader_cfg = {
**test_dataloader_default_args,
**cfg.data.get('test_dataloader', {})
}
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, **test_loader_cfg)
# build the model
cfg.model.train_cfg = None
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
# build tensorrt model
    if cfg.model.get('wdet3d', True) and not cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(6 * len(model.pts_bbox_head.task_heads))])
    elif cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(1 + 6 * len(model.pts_bbox_head.task_heads))])
    elif not cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(1)])
    else:
        raise ValueError('At least one of wdet3d and wocc must be set to True!')
num_warmup = 50
pure_inf_time = 0
init_ = True
metas = dict()
# benchmark with several samples and take the average
results = list()
for i, data in enumerate(data_loader):
if init_:
inputs = [t.cuda() for t in data['img_inputs'][0]]
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT']:
metas_ = model.get_bev_pool_input(inputs, img_metas=data['img_metas'])
else:
if model.__class__.__name__ in ['BEVDetOCCTRT']:
metas_ = model.get_bev_pool_input(inputs)
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas_, mlp_input = model.get_bev_pool_input(inputs)
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT', 'BEVDetOCCTRT']:
metas = dict(
ranks_bev=metas_[0].int().contiguous(),
ranks_depth=metas_[1].int().contiguous(),
ranks_feat=metas_[2].int().contiguous(),
interval_starts=metas_[3].int().contiguous(),
interval_lengths=metas_[4].int().contiguous())
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas = dict(
ranks_bev=metas_[0].int().contiguous(),
ranks_depth=metas_[1].int().contiguous(),
ranks_feat=metas_[2].int().contiguous(),
interval_starts=metas_[3].int().contiguous(),
interval_lengths=metas_[4].int().contiguous(),
mlp_input=mlp_input)
init_ = False
img = data['img_inputs'][0][0].cuda().squeeze(0).contiguous()
if img.shape[0] > 6:
img = img[:6]
torch.cuda.synchronize()
start_time = time.perf_counter()
trt_output = trt_model.forward(dict(img=img, **metas))
# postprocessing
if args.postprocessing:
if cfg.model.get('wdet3d', True):
trt_output_det = [trt_output[f'output_{i}'] for i in
range(6 * len(model.pts_bbox_head.task_heads))]
pred = model.result_deserialize(trt_output_det)
img_metas = [dict(box_type_3d=LiDARInstance3DBoxes)]
bbox_list = model.pts_bbox_head.get_bboxes(
pred, img_metas, rescale=True)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
if cfg.model.get('wocc', True):
# occupancy
if cfg.model.get('wdet3d', True):
occ_preds = model.occ_head.get_occ(trt_output['output_6']) # List[(Dx, Dy, Dz), (Dx, Dy, Dz), ...]
else:
occ_preds = model.occ_head.get_occ(trt_output['output_0']) # List[(Dx, Dy, Dz), (Dx, Dy, Dz), ...]
if args.eval:
if cfg.model.get('wdet3d', True) and (not cfg.model.get('wocc', True)):
results.append(bbox_results[0])
elif cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
results.append({'pts_bbox': bbox_results[0], 'pred_occ': occ_preds[0]})
elif (not cfg.model.get('wdet3d', False)) and cfg.model.get('wocc', True):
results.append(occ_preds[0])
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % 50 == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.2f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000/fps:.2f} ms')
if not args.eval:
return
assert args.eval
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval))
print(dataset.evaluate(results, **eval_kwargs))
if __name__ == '__main__':
    main()
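# Example invocation (the TensorRT engine is exported beforehand; script, config and
# engine paths are placeholders):
#   python tools/analysis_tools/benchmark_trt.py configs/flashocc/flashocc-r50.py work_dirs/flashocc/bevdet.engine --eval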
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import numpy as np
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=1000, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--mem-only',
action='store_true',
help='Conduct the memory analysis only')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=False,
shuffle=False)
# build the model and load checkpoint
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
cfg.model.train_cfg = None
assert cfg.model.type == 'BEVDet', \
'Please use class BEVDet for ' \
'view transformation inference ' \
'speed estimation instead of %s'% cfg.model.type
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
load_checkpoint(model, args.checkpoint, map_location='cpu')
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 100
pure_inf_time = 0
D = model.module.img_view_transformer.D
out_channels = model.module.img_view_transformer.out_channels
depth_net = model.module.img_view_transformer.depth_net
view_transformer = model.module.img_view_transformer
# benchmark with several samples and take the average
for i, data in enumerate(data_loader):
with torch.no_grad():
img_feat, _ = \
model.module.image_encoder(data['img_inputs'][0][0].cuda())
B, N, C, H, W = img_feat.shape
x = depth_net(img_feat.reshape(B * N, C, H, W))
depth_digit = x[:, :D, ...]
tran_feat = x[:, D:D + out_channels, ...]
depth = depth_digit.softmax(dim=1)
input = [img_feat] + [d.cuda() for d in data['img_inputs'][0][1:]]
if i == 0:
precomputed_memory_allocated = 0.0
if view_transformer.accelerate:
start_mem_allocated = torch.cuda.memory_allocated()
view_transformer.pre_compute(input)
end_mem_allocated = torch.cuda.memory_allocated()
precomputed_memory_allocated = \
end_mem_allocated - start_mem_allocated
ref_max_mem_allocated = torch.cuda.max_memory_allocated()
# occupy the memory
size = (ref_max_mem_allocated - end_mem_allocated) // 4
occupy_tensor = torch.zeros(
size=(size, ), device='cuda', dtype=torch.float32)
print('Memory analysis: \n'
'precomputed_memory_allocated : %d B / %.01f MB \n' %
(precomputed_memory_allocated,
precomputed_memory_allocated / 1024 / 1024))
start_mem_allocated = torch.cuda.memory_allocated()
bev_feat = view_transformer.view_transform_core(
input, depth, tran_feat)[0]
end_max_mem_allocated = torch.cuda.max_memory_allocated()
peak_memory_allocated = \
end_max_mem_allocated - start_mem_allocated
total_memory_requirement = \
precomputed_memory_allocated + peak_memory_allocated
print('Memory analysis: \n'
'Memory requirement : %d B / %.01f MB \n' %
(total_memory_requirement,
total_memory_requirement / 1024 / 1024))
if args.mem_only:
return
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
view_transformer.view_transform(input, depth, tran_feat)[0]
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall fps: {fps:.1f} img / s')
return fps
if __name__ == '__main__':
repeat_times = 1
fps_list = []
for _ in range(repeat_times):
fps = main()
time.sleep(5)
fps_list.append(fps)
fps_list = np.array(fps_list, dtype=np.float32)
print(f'Mean Overall fps: {fps_list.mean():.4f} +'
f' {np.sqrt(fps_list.var()):.4f} img / s')
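# Example invocation (a BEVDet-style config is required by the assert above; script,
# config and checkpoint paths are placeholders):
#   python tools/analysis_tools/benchmark_view_transformer.py configs/bevdet/bevdet-r50.py work_dirs/bevdet/epoch_24.pth --mem-only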
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import sys
sys.path.insert(0, os.getcwd())
import torch
from mmcv import Config, DictAction
from mmdet3d.models import build_model
try:
from mmcv.cnn import get_model_complexity_info
except ImportError:
raise ImportError('Please upgrade mmcv to >0.6.2')
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument(
'--shape',
type=int,
nargs='+',
default=[40000, 4],
help='input point cloud size')
parser.add_argument(
'--modality',
type=str,
default='point',
choices=['point', 'image', 'multi'],
help='input data modality')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def construct_input(input_shape):
rot = torch.eye(4).float().cuda().view(1, 1, 4, 4).expand(1,6,4,4)
intrins = torch.eye(3).float().cuda().view(1,1, 3, 3).expand(1,6,3,3)
input = dict(img_inputs=[
torch.ones(()).new_empty((1, 6, *input_shape)).cuda(), rot,
rot, intrins, intrins,
torch.ones((1, 6, 3)).cuda(),
torch.eye(3).float().cuda().view(1, 3, 3)
])
return input
def main():
args = parse_args()
if args.modality == 'point':
assert len(args.shape) == 2, 'invalid input shape'
input_shape = tuple(args.shape)
elif args.modality == 'image':
if len(args.shape) == 1:
input_shape = (3, args.shape[0], args.shape[0])
elif len(args.shape) == 2:
input_shape = (3, ) + tuple(args.shape)
else:
raise ValueError('invalid input shape')
elif args.modality == 'multi':
raise NotImplementedError(
'FLOPs counter is currently not supported for models with '
'multi-modality input')
cfg = Config.fromfile(args.config)
# if 'stereo' in args.config or 'longterm' in args.config:
# assert False,'Config has not supported: %s ' % args.config
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
model = build_model(
cfg.model,
train_cfg=cfg.get('train_cfg'),
test_cfg=cfg.get('test_cfg'))
if torch.cuda.is_available():
model.cuda()
model.eval()
if hasattr(model, 'forward_dummy'):
model.forward = model.forward_dummy
else:
raise NotImplementedError(
'FLOPs counter is currently not supported for {}'.format(
model.__class__.__name__))
flops, params = get_model_complexity_info(
model, input_shape, input_constructor=construct_input)
split_line = '=' * 30
print(f'{split_line}\nInput shape: {input_shape}\n'
f'Flops: {flops}\nParams: {params}\n{split_line}')
print('!!!Please be cautious if you use the results in papers. '
'You may need to check if all ops are supported and verify that the '
'flops computation is correct.')
if __name__ == '__main__':
main()
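# Example invocation (image modality with a 256x704 input; script and config paths
# are placeholders):
#   python tools/analysis_tools/get_flops.py configs/flashocc/flashocc-r50.py --modality image --shape 256 704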
# Copyright (c) Phigent Robotics. All rights reserved.
import argparse
import json
import os
import pickle
import cv2
import numpy as np
from pyquaternion.quaternion import Quaternion
from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes as LB
def check_point_in_img(points, height, width):
valid = np.logical_and(points[:, 0] >= 0, points[:, 1] >= 0)
valid = np.logical_and(
valid, np.logical_and(points[:, 0] < width, points[:, 1] < height))
return valid
def depth2color(depth):
gray = max(0, min((depth + 2.5) / 3.0, 1.0))
max_lumi = 200
colors = np.array(
[[max_lumi, 0, max_lumi], [max_lumi, 0, 0], [max_lumi, max_lumi, 0],
[0, max_lumi, 0], [0, max_lumi, max_lumi], [0, 0, max_lumi]],
dtype=np.float32)
if gray == 1:
return tuple(colors[-1].tolist())
num_rank = len(colors) - 1
    rank = np.floor(gray * num_rank).astype(int)
diff = (gray - rank / num_rank) * num_rank
return tuple(
(colors[rank] + (colors[rank + 1] - colors[rank]) * diff).tolist())
def lidar2img(points_lidar, camera_info):
    # Project lidar-frame points into the image plane of a single camera.
    points_lidar_homogeneous = \
        np.concatenate([points_lidar,
                        np.ones((points_lidar.shape[0], 1),
                                dtype=points_lidar.dtype)], axis=1)
    camera2lidar = np.eye(4, dtype=np.float32)
    camera2lidar[:3, :3] = camera_info['sensor2lidar_rotation']
    camera2lidar[:3, 3] = camera_info['sensor2lidar_translation']
    lidar2camera = np.linalg.inv(camera2lidar)
    points_camera_homogeneous = points_lidar_homogeneous @ lidar2camera.T
    points_camera = points_camera_homogeneous[:, :3]
    # keep only points sufficiently in front of the camera (depth > 0.5 m)
    valid = np.ones((points_camera.shape[0]), dtype=bool)
    valid = np.logical_and(points_camera[:, -1] > 0.5, valid)
    points_camera = points_camera / points_camera[:, 2:3]
    camera2img = camera_info['cam_intrinsic']
    points_img = points_camera @ camera2img.T
    points_img = points_img[:, :2]
    return points_img, valid
def get_lidar2global(infos):
lidar2ego = np.eye(4, dtype=np.float32)
lidar2ego[:3, :3] = Quaternion(infos['lidar2ego_rotation']).rotation_matrix
lidar2ego[:3, 3] = infos['lidar2ego_translation']
ego2global = np.eye(4, dtype=np.float32)
ego2global[:3, :3] = Quaternion(
infos['ego2global_rotation']).rotation_matrix
ego2global[:3, 3] = infos['ego2global_translation']
return ego2global @ lidar2ego
def parse_args():
parser = argparse.ArgumentParser(description='Visualize the predicted '
'result of nuScenes')
parser.add_argument(
'res', help='Path to the predicted result in json format')
parser.add_argument(
'--show-range',
type=int,
default=50,
help='Range of visualization in BEV')
parser.add_argument(
        '--canva-size', type=int, default=1000, help='Size of the BEV canvas in pixels')
parser.add_argument(
'--vis-frames',
type=int,
default=500,
help='Number of frames for visualization')
parser.add_argument(
'--scale-factor',
type=int,
default=4,
        help='Scale factor trading off the size of the image views against '
        'the BEV canvas in the visualized result')
parser.add_argument(
'--vis-thred',
type=float,
default=0.3,
        help='Score threshold for filtering the predicted results')
parser.add_argument('--draw-gt', action='store_true')
parser.add_argument(
'--version',
type=str,
default='val',
help='Version of nuScenes dataset')
parser.add_argument(
'--root_path',
type=str,
default='./data/nuscenes',
help='Path to nuScenes dataset')
parser.add_argument(
'--save_path',
type=str,
default='./vis',
help='Path to save visualization results')
parser.add_argument(
'--format',
type=str,
default='video',
choices=['video', 'image'],
help='The desired format of the visualization result')
parser.add_argument(
'--fps', type=int, default=20, help='Frame rate of video')
parser.add_argument(
'--video-prefix', type=str, default='vis', help='name of video')
args = parser.parse_args()
return args
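# Example invocation (hypothetical script path and result file, shown for illustration only):
#   python tools/analysis_tools/vis.py work_dirs/bevdet/results_nusc.json \
#       --version val --format video --fps 20 --draw-gt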
color_map = {0: (255, 255, 0), 1: (0, 255, 255)}
def main():
args = parse_args()
# load predicted results
res = json.load(open(args.res, 'r'))
# load dataset information
info_path = \
args.root_path + '/bevdetv2-nuscenes_infos_%s.pkl' % args.version
dataset = pickle.load(open(info_path, 'rb'))
# prepare save path and medium
vis_dir = args.save_path
if not os.path.exists(vis_dir):
os.makedirs(vis_dir)
print('saving visualized result to %s' % vis_dir)
scale_factor = args.scale_factor
canva_size = args.canva_size
show_range = args.show_range
if args.format == 'video':
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
vout = cv2.VideoWriter(
os.path.join(vis_dir, '%s.mp4' % args.video_prefix), fourcc,
args.fps, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
draw_boxes_indexes_bev = [(0, 1), (1, 2), (2, 3), (3, 0)]
draw_boxes_indexes_img_view = [(0, 1), (1, 2), (2, 3), (3, 0), (4, 5),
(5, 6), (6, 7), (7, 4), (0, 4), (1, 5),
(2, 6), (3, 7)]
views = [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
]
print('start visualizing results')
for cnt, infos in enumerate(
dataset['infos'][:min(args.vis_frames, len(dataset['infos']))]):
if cnt % 10 == 0:
print('%d/%d' % (cnt, min(args.vis_frames, len(dataset['infos']))))
# collect instances
pred_res = res['results'][infos['token']]
pred_boxes = [
pred_res[rid]['translation'] + pred_res[rid]['size'] + [
Quaternion(pred_res[rid]['rotation']).yaw_pitch_roll[0] +
np.pi / 2
] for rid in range(len(pred_res))
]
if len(pred_boxes) == 0:
corners_lidar = np.zeros((0, 3), dtype=np.float32)
else:
pred_boxes = np.array(pred_boxes, dtype=np.float32)
boxes = LB(pred_boxes, origin=(0.5, 0.5, 0.0))
corners_global = boxes.corners.numpy().reshape(-1, 3)
corners_global = np.concatenate(
[corners_global,
np.ones([corners_global.shape[0], 1])],
axis=1)
l2g = get_lidar2global(infos)
corners_lidar = corners_global @ np.linalg.inv(l2g).T
corners_lidar = corners_lidar[:, :3]
        pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
scores = [
pred_res[rid]['detection_score'] for rid in range(len(pred_res))
]
if args.draw_gt:
gt_boxes = infos['gt_boxes']
gt_boxes[:, -1] = gt_boxes[:, -1] + np.pi / 2
width = gt_boxes[:, 4].copy()
gt_boxes[:, 4] = gt_boxes[:, 3]
gt_boxes[:, 3] = width
corners_lidar_gt = \
LB(infos['gt_boxes'],
origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
axis=0)
            gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
pred_flag = np.concatenate(
[pred_flag, np.logical_not(gt_flag)], axis=0)
scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
scores = np.array(scores, dtype=np.float32)
sort_ids = np.argsort(scores)
# image view
imgs = []
for view in views:
img = cv2.imread(infos['cams'][view]['data_path'])
# draw instances
corners_img, valid = lidar2img(corners_lidar, infos['cams'][view])
valid = np.logical_and(
valid,
check_point_in_img(corners_img, img.shape[0], img.shape[1]))
valid = valid.reshape(-1, 8)
            corners_img = corners_img.reshape(-1, 8, 2).astype(np.int32)
for aid in range(valid.shape[0]):
for index in draw_boxes_indexes_img_view:
if valid[aid, index[0]] and valid[aid, index[1]]:
cv2.line(
img,
tuple(corners_img[aid, index[0]]),
tuple(corners_img[aid, index[1]]),
color=color_map[int(pred_flag[aid])],
thickness=scale_factor)
imgs.append(img)
# bird-eye-view
canvas = np.zeros((int(canva_size), int(canva_size), 3),
dtype=np.uint8)
# draw lidar points
lidar_points = np.fromfile(infos['lidar_path'], dtype=np.float32)
lidar_points = lidar_points.reshape(-1, 5)[:, :3]
lidar_points[:, 1] = -lidar_points[:, 1]
lidar_points[:, :2] = \
(lidar_points[:, :2] + show_range) / show_range / 2.0 * canva_size
for p in lidar_points:
if check_point_in_img(
p.reshape(1, 3), canvas.shape[1], canvas.shape[0])[0]:
color = depth2color(p[2])
cv2.circle(
canvas, (int(p[0]), int(p[1])),
radius=0,
color=color,
thickness=1)
# draw instances
corners_lidar = corners_lidar.reshape(-1, 8, 3)
corners_lidar[:, :, 1] = -corners_lidar[:, :, 1]
bottom_corners_bev = corners_lidar[:, [0, 3, 7, 4], :2]
bottom_corners_bev = \
(bottom_corners_bev + show_range) / show_range / 2.0 * canva_size
bottom_corners_bev = np.round(bottom_corners_bev).astype(np.int32)
center_bev = corners_lidar[:, [0, 3, 7, 4], :2].mean(axis=1)
head_bev = corners_lidar[:, [0, 4], :2].mean(axis=1)
        center_canvas = \
            (center_bev + show_range) / show_range / 2.0 * canva_size
        center_canvas = center_canvas.astype(np.int32)
head_canvas = (head_bev + show_range) / show_range / 2.0 * canva_size
head_canvas = head_canvas.astype(np.int32)
for rid in sort_ids:
score = scores[rid]
if score < args.vis_thred and pred_flag[rid]:
continue
score = min(score * 2.0, 1.0) if pred_flag[rid] else 1.0
color = color_map[int(pred_flag[rid])]
for index in draw_boxes_indexes_bev:
cv2.line(
canvas,
bottom_corners_bev[rid, index[0]],
bottom_corners_bev[rid, index[1]],
[color[0] * score, color[1] * score, color[2] * score],
thickness=1)
cv2.line(
canvas,
center_canvas[rid],
head_canvas[rid],
[color[0] * score, color[1] * score, color[2] * score],
1,
lineType=8)
# fuse image-view and bev
img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3),
dtype=np.uint8)
img[:900, :, :] = np.concatenate(imgs[:3], axis=1)
img_back = np.concatenate(
[imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]],
axis=1)
img[900 + canva_size * scale_factor:, :, :] = img_back
img = cv2.resize(img, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
w_begin = int((1600 * 3 / scale_factor - canva_size) // 2)
img[int(900 / scale_factor):int(900 / scale_factor) + canva_size,
w_begin:w_begin + canva_size, :] = canvas
if args.format == 'image':
cv2.imwrite(os.path.join(vis_dir, '%s.jpg' % infos['token']), img)
elif args.format == 'video':
vout.write(img)
if args.format == 'video':
vout.release()
if __name__ == '__main__':
main()
import os
import mmcv
import open3d as o3d
import numpy as np
import torch
import pickle
import math
from typing import Tuple, List, Dict, Iterable
import argparse
import cv2
NOT_OBSERVED = -1
FREE = 0
OCCUPIED = 1
FREE_LABEL = 17
BINARY_OBSERVED = 1
BINARY_NOT_OBSERVED = 0
VOXEL_SIZE = [0.4, 0.4, 0.4]
POINT_CLOUD_RANGE = [-40, -40, -1, 40, 40, 5.4]
SPATIAL_SHAPE = [200, 200, 16]
TGT_VOXEL_SIZE = [0.4, 0.4, 0.4]
TGT_POINT_CLOUD_RANGE = [-40, -40, -1, 40, 40, 5.4]
colormap_to_colors = np.array(
    [
        [0, 0, 0, 255],         # 0  undefined / others
        [112, 128, 144, 255],   # 1  barrier
        [220, 20, 60, 255],     # 2  bicycle
        [255, 127, 80, 255],    # 3  bus
        [255, 158, 0, 255],     # 4  car
        [233, 150, 70, 255],    # 5  construction vehicle
        [255, 61, 99, 255],     # 6  motorcycle
        [0, 0, 230, 255],       # 7  pedestrian
        [47, 79, 79, 255],      # 8  traffic cone
        [255, 140, 0, 255],     # 9  trailer
        [255, 99, 71, 255],     # 10 truck
        [0, 207, 191, 255],     # 11 driveable surface
        [175, 0, 75, 255],      # 12 other flat
        [75, 0, 75, 255],       # 13 sidewalk
        [112, 180, 60, 255],    # 14 terrain
        [222, 184, 135, 255],   # 15 manmade
        [0, 175, 0, 255],       # 16 vegetation
    ], dtype=np.float32)
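# RGBA values in [0, 255]; show_occ() below divides them by 255 before passing them to Open3D.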
def voxel2points(voxel, occ_show, voxelSize):
"""
Args:
voxel: (Dx, Dy, Dz)
occ_show: (Dx, Dy, Dz)
voxelSize: (dx, dy, dz)
Returns:
points: (N, 3) 3: (x, y, z)
voxel: (N, ) cls_id
occIdx: (x_idx, y_idx, z_idx)
"""
occIdx = torch.where(occ_show)
points = torch.cat((occIdx[0][:, None] * voxelSize[0] + POINT_CLOUD_RANGE[0], \
occIdx[1][:, None] * voxelSize[1] + POINT_CLOUD_RANGE[1], \
occIdx[2][:, None] * voxelSize[2] + POINT_CLOUD_RANGE[2]),
dim=1) # (N, 3) 3: (x, y, z)
return points, voxel[occIdx], occIdx
def voxel_profile(voxel, voxel_size):
"""
Args:
voxel: (N, 3) 3:(x, y, z)
voxel_size: (vx, vy, vz)
Returns:
box: (N, 7) (x, y, z - dz/2, vx, vy, vz, 0)
"""
centers = torch.cat((voxel[:, :2], voxel[:, 2][:, None] - voxel_size[2] / 2), dim=1) # (x, y, z - dz/2)
# centers = voxel
wlh = torch.cat((torch.tensor(voxel_size[0]).repeat(centers.shape[0])[:, None],
torch.tensor(voxel_size[1]).repeat(centers.shape[0])[:, None],
torch.tensor(voxel_size[2]).repeat(centers.shape[0])[:, None]), dim=1)
yaw = torch.full_like(centers[:, 0:1], 0)
return torch.cat((centers, wlh, yaw), dim=1)
def rotz(t):
"""Rotation about the z-axis."""
c = torch.cos(t)
s = torch.sin(t)
return torch.tensor([[c, -s, 0],
[s, c, 0],
[0, 0, 1]])
def my_compute_box_3d(center, size, heading_angle):
"""
Args:
center: (N, 3) 3: (x, y, z - dz/2)
size: (N, 3) 3: (vx, vy, vz)
heading_angle: (N, 1)
Returns:
corners_3d: (N, 8, 3)
"""
h, w, l = size[:, 2], size[:, 0], size[:, 1]
center[:, 2] = center[:, 2] + h / 2
l, w, h = (l / 2).unsqueeze(1), (w / 2).unsqueeze(1), (h / 2).unsqueeze(1)
x_corners = torch.cat([-l, l, l, -l, -l, l, l, -l], dim=1)[..., None]
y_corners = torch.cat([w, w, -w, -w, w, w, -w, -w], dim=1)[..., None]
z_corners = torch.cat([h, h, h, h, -h, -h, -h, -h], dim=1)[..., None]
corners_3d = torch.cat([x_corners, y_corners, z_corners], dim=2)
corners_3d[..., 0] += center[:, 0:1]
corners_3d[..., 1] += center[:, 1:2]
corners_3d[..., 2] += center[:, 2:3]
return corners_3d
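# Illustrative sanity check (not part of the pipeline): a single unit voxel whose
# bottom-face centre sits at the origin yields corners spanning [-0.5, 0.5] in x/y
# and [0, 1] in z, because the function shifts the centre up by h/2 in place:
#   my_compute_box_3d(torch.zeros(1, 3), torch.ones(1, 3), torch.zeros(1, 1))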
def show_point_cloud(points: np.ndarray, colors=True, points_colors=None, bbox3d=None, voxelize=False,
bbox_corners=None, linesets=None, vis=None, offset=[0,0,0], large_voxel=True, voxel_size=0.4):
"""
:param points: (N, 3) 3:(x, y, z)
:param colors: false 不显示点云颜色
:param points_colors: (N, 4)
:param bbox3d: voxel grid (N, 7) 7: (center, wlh, yaw=0)
:param voxelize: false 不显示voxel边界
:param bbox_corners: (N, 8, 3) voxel grid 角点坐标, 用于绘制voxel grid 边界.
:param linesets: 用于绘制voxel grid 边界.
:return:
"""
if vis is None:
vis = o3d.visualization.VisualizerWithKeyCallback()
vis.create_window()
if isinstance(offset, list) or isinstance(offset, tuple):
offset = np.array(offset)
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(points+offset)
if colors:
pcd.colors = o3d.utility.Vector3dVector(points_colors[:, :3])
mesh_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(
size=1, origin=[0, 0, 0])
voxelGrid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size=voxel_size)
if large_voxel:
vis.add_geometry(voxelGrid)
else:
vis.add_geometry(pcd)
if voxelize:
line_sets = o3d.geometry.LineSet()
line_sets.points = o3d.open3d.utility.Vector3dVector(bbox_corners.reshape((-1, 3))+offset)
line_sets.lines = o3d.open3d.utility.Vector2iVector(linesets.reshape((-1, 2)))
line_sets.paint_uniform_color((0, 0, 0))
vis.add_geometry(line_sets)
vis.add_geometry(mesh_frame)
# ego_pcd = o3d.geometry.PointCloud()
# ego_points = generate_the_ego_car()
# ego_pcd.points = o3d.utility.Vector3dVector(ego_points)
# vis.add_geometry(ego_pcd)
return vis
def show_occ(occ_state, occ_show, voxel_size, vis=None, offset=[0, 0, 0]):
"""
Args:
occ_state: (Dx, Dy, Dz), cls_id
occ_show: (Dx, Dy, Dz), bool
voxel_size: [0.4, 0.4, 0.4]
vis: Visualizer
offset:
Returns:
"""
colors = colormap_to_colors / 255
pcd, labels, occIdx = voxel2points(occ_state, occ_show, voxel_size)
# pcd: (N, 3) 3: (x, y, z)
# labels: (N, ) cls_id
_labels = labels % len(colors)
pcds_colors = colors[_labels] # (N, 4)
bboxes = voxel_profile(pcd, voxel_size) # (N, 7) 7: (x, y, z - dz/2, dx, dy, dz, 0)
bboxes_corners = my_compute_box_3d(bboxes[:, 0:3], bboxes[:, 3:6], bboxes[:, 6:7]) # (N, 8, 3)
bases_ = torch.arange(0, bboxes_corners.shape[0] * 8, 8)
    edges = torch.tensor([[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7], [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]])  # the 12 edges of each box
edges = edges.reshape((1, 12, 2)).repeat(bboxes_corners.shape[0], 1, 1) # (N, 12, 2)
    # (N, 12, 2) + (N, 1, 1) --> (N, 12, 2): offset the edge indices so they index into the flattened corner array, i.e. range [0, N*8)
edges = edges + bases_[:, None, None]
vis = show_point_cloud(
points=pcd.numpy(),
colors=True,
points_colors=pcds_colors,
voxelize=True,
bbox3d=bboxes.numpy(),
bbox_corners=bboxes_corners.numpy(),
linesets=edges.numpy(),
vis=vis,
offset=offset,
large_voxel=True,
voxel_size=0.4
)
return vis
def generate_the_ego_car():
ego_range = [-2, -1, 0, 2, 1, 1.5]
ego_voxel_size=[0.1, 0.1, 0.1]
ego_xdim = int((ego_range[3] - ego_range[0]) / ego_voxel_size[0])
ego_ydim = int((ego_range[4] - ego_range[1]) / ego_voxel_size[1])
ego_zdim = int((ego_range[5] - ego_range[2]) / ego_voxel_size[2])
temp_x = np.arange(ego_xdim)
temp_y = np.arange(ego_ydim)
temp_z = np.arange(ego_zdim)
ego_xyz = np.stack(np.meshgrid(temp_y, temp_x, temp_z), axis=-1).reshape(-1, 3)
ego_point_x = (ego_xyz[:, 0:1] + 0.5) / ego_xdim * (ego_range[3] - ego_range[0]) + ego_range[0]
ego_point_y = (ego_xyz[:, 1:2] + 0.5) / ego_ydim * (ego_range[4] - ego_range[1]) + ego_range[1]
ego_point_z = (ego_xyz[:, 2:3] + 0.5) / ego_zdim * (ego_range[5] - ego_range[2]) + ego_range[2]
ego_point_xyz = np.concatenate((ego_point_y, ego_point_x, ego_point_z), axis=-1)
ego_points_label = (np.ones((ego_point_xyz.shape[0]))*16).astype(np.uint8)
ego_dict = {}
ego_dict['point'] = ego_point_xyz
ego_dict['label'] = ego_points_label
return ego_point_xyz
def parse_args():
parser = argparse.ArgumentParser(description='Visualize the predicted '
'result of nuScenes')
parser.add_argument(
'res', help='Path to the predicted result')
parser.add_argument(
        '--canva-size', type=int, default=1000, help='Size of the BEV canvas in pixels')
parser.add_argument(
'--vis-frames',
type=int,
default=500,
help='Number of frames for visualization')
parser.add_argument(
'--scale-factor',
type=int,
default=4,
        help='Scale factor trading off the size of the image views against '
        'the BEV canvas in the visualized result')
parser.add_argument(
'--version',
type=str,
default='val',
help='Version of nuScenes dataset')
parser.add_argument('--draw-gt', action='store_true')
parser.add_argument(
'--root_path',
type=str,
default='./data/nuscenes',
help='Path to nuScenes dataset')
parser.add_argument(
'--save_path',
type=str,
default='./vis',
help='Path to save visualization results')
parser.add_argument(
'--format',
type=str,
default='image',
choices=['video', 'image'],
help='The desired format of the visualization result')
parser.add_argument(
'--fps', type=int, default=10, help='Frame rate of video')
parser.add_argument(
'--video-prefix', type=str, default='vis', help='name of video')
args = parser.parse_args()
return args
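# Example invocation (hypothetical script path and results directory, shown for illustration only):
#   python tools/analysis_tools/vis_occ.py work_dirs/flashocc/occ_results \
#       --version val --format image --draw-gt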
def main():
args = parse_args()
# load predicted results
results_dir = args.res
# load dataset information
info_path = \
args.root_path + '/bevdetv2-nuscenes_infos_%s.pkl' % args.version
dataset = pickle.load(open(info_path, 'rb'))
# prepare save path and medium
vis_dir = args.save_path
if not os.path.exists(vis_dir):
os.makedirs(vis_dir)
print('saving visualized result to %s' % vis_dir)
scale_factor = args.scale_factor
canva_size = args.canva_size
if args.format == 'video':
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
vout = cv2.VideoWriter(
os.path.join(vis_dir, '%s.mp4' % args.video_prefix), fourcc,
args.fps, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
views = [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
]
print('start visualizing results')
vis = o3d.visualization.VisualizerWithKeyCallback()
vis.create_window()
for cnt, info in enumerate(
dataset['infos'][:min(args.vis_frames, len(dataset['infos']))]):
if cnt % 10 == 0:
print('%d/%d' % (cnt, min(args.vis_frames, len(dataset['infos']))))
scene_name = info['scene_name']
sample_token = info['token']
pred_occ_path = os.path.join(results_dir, scene_name, sample_token, 'pred.npz')
gt_occ_path = info['occ_path']
pred_occ = np.load(pred_occ_path)['pred']
gt_data = np.load(os.path.join(args.root_path, gt_occ_path, 'labels.npz'))
voxel_label = gt_data['semantics']
lidar_mask = gt_data['mask_lidar']
camera_mask = gt_data['mask_camera']
# load imgs
imgs = []
for view in views:
img = cv2.imread(info['cams'][view]['data_path'])
imgs.append(img)
# occ_canvas
voxel_show = np.logical_and(pred_occ != FREE_LABEL, camera_mask)
# voxel_show = pred_occ != FREE_LABEL
voxel_size = VOXEL_SIZE
vis = show_occ(torch.from_numpy(pred_occ), torch.from_numpy(voxel_show), voxel_size=voxel_size, vis=vis,
offset=[0, pred_occ.shape[0] * voxel_size[0] * 1.2 * 0, 0])
if args.draw_gt:
voxel_show = np.logical_and(voxel_label != FREE_LABEL, camera_mask)
vis = show_occ(torch.from_numpy(voxel_label), torch.from_numpy(voxel_show), voxel_size=voxel_size, vis=vis,
offset=[0, voxel_label.shape[0] * voxel_size[0] * 1.2 * 1, 0])
view_control = vis.get_view_control()
look_at = np.array([-0.185, 0.513, 3.485])
front = np.array([-0.974, -0.055, 0.221])
up = np.array([0.221, 0.014, 0.975])
zoom = np.array([0.08])
view_control.set_lookat(look_at)
view_control.set_front(front)
view_control.set_up(up)
view_control.set_zoom(zoom)
opt = vis.get_render_option()
opt.background_color = np.asarray([1, 1, 1])
opt.line_width = 5
vis.poll_events()
vis.update_renderer()
vis.run()
# if args.format == 'image':
# out_dir = os.path.join(vis_dir, f'{scene_name}', f'{sample_token}')
# mmcv.mkdir_or_exist(out_dir)
# vis.capture_screen_image(os.path.join(out_dir, 'screen_occ.png'), do_render=True)
occ_canvas = vis.capture_screen_float_buffer(do_render=True)
occ_canvas = np.asarray(occ_canvas)
occ_canvas = (occ_canvas * 255).astype(np.uint8)
occ_canvas = occ_canvas[..., [2, 1, 0]]
occ_canvas_resize = cv2.resize(occ_canvas, (canva_size, canva_size), interpolation=cv2.INTER_CUBIC)
vis.clear_geometries()
big_img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3),
dtype=np.uint8)
big_img[:900, :, :] = np.concatenate(imgs[:3], axis=1)
img_back = np.concatenate(
[imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]],
axis=1)
big_img[900 + canva_size * scale_factor:, :, :] = img_back
big_img = cv2.resize(big_img, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
w_begin = int((1600 * 3 / scale_factor - canva_size) // 2)
big_img[int(900 / scale_factor):int(900 / scale_factor) + canva_size,
w_begin:w_begin + canva_size, :] = occ_canvas_resize
if args.format == 'image':
out_dir = os.path.join(vis_dir, f'{scene_name}', f'{sample_token}')
mmcv.mkdir_or_exist(out_dir)
for i, img in enumerate(imgs):
cv2.imwrite(os.path.join(out_dir, f'img{i}.png'), img)
cv2.imwrite(os.path.join(out_dir, 'occ.png'), occ_canvas)
cv2.imwrite(os.path.join(out_dir, 'overall.png'), big_img)
elif args.format == 'video':
            cv2.putText(big_img, f'{cnt}', (5, 15), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
                        fontScale=0.5)
cv2.putText(big_img, f'{scene_name}', (5, 35), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
fontScale=0.5)
cv2.putText(big_img, f'{sample_token[:5]}', (5, 55), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
fontScale=0.5)
vout.write(big_img)
if args.format == 'video':
vout.release()
vis.destroy_window()
if __name__ == '__main__':
main()
import argparse
import sys
import os
sys.path.insert(0, os.getcwd())
import torch.onnx
from mmcv import Config
from mmdeploy.backend.tensorrt.utils import save, search_cuda_version
try:
# If mmdet version > 2.23.0, compat_cfg would be imported and
# used from mmdet instead of mmdet3d.
from mmdet.utils import compat_cfg
except ImportError:
from mmdet3d.utils import compat_cfg
import os
from typing import Dict, Optional, Sequence, Union
import h5py
import mmcv
import numpy as np
import onnx
import pycuda.driver as cuda
import tensorrt as trt
import torch
import tqdm
from mmcv.runner import load_checkpoint
from mmdeploy.apis.core import no_mp
from mmdeploy.backend.tensorrt.calib_utils import HDF5Calibrator
from mmdeploy.backend.tensorrt.init_plugins import load_tensorrt_plugin
from mmdeploy.utils import load_config
from packaging import version
from torch.utils.data import DataLoader
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model
from mmdet.datasets import replace_ImageToTensor
from tools.misc.fuse_conv_bn import fuse_module
class HDF5CalibratorBEVDet(HDF5Calibrator):
def get_batch(self, names: Sequence[str], **kwargs) -> list:
"""Get batch data."""
if self.count < self.dataset_length:
if self.count % 100 == 0:
print('%d/%d' % (self.count, self.dataset_length))
ret = []
for name in names:
input_group = self.calib_data[name]
if name == 'img':
data_np = input_group[str(self.count)][...].astype(
np.float32)
else:
data_np = input_group[str(self.count)][...].astype(
np.int32)
                    # tile the tensor so we can keep the same distribution
opt_shape = self.input_shapes[name]['opt_shape']
data_shape = data_np.shape
reps = [
int(np.ceil(opt_s / data_s))
for opt_s, data_s in zip(opt_shape, data_shape)
]
data_np = np.tile(data_np, reps)
slice_list = tuple(slice(0, end) for end in opt_shape)
data_np = data_np[slice_list]
data_np_cuda_ptr = cuda.mem_alloc(data_np.nbytes)
cuda.memcpy_htod(data_np_cuda_ptr,
np.ascontiguousarray(data_np))
self.buffers[name] = data_np_cuda_ptr
ret.append(self.buffers[name])
self.count += 1
return ret
else:
return None
def parse_args():
parser = argparse.ArgumentParser(description='Deploy BEVDet with Tensorrt')
parser.add_argument('config', help='deploy config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('work_dir', help='work dir to save file')
parser.add_argument(
'--prefix', default='bevdet', help='prefix of the save file name')
parser.add_argument(
'--fp16', action='store_true', help='Whether to use tensorrt fp16')
parser.add_argument(
'--int8', action='store_true', help='Whether to use tensorrt int8')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn; this will slightly increase '
        'the inference speed')
parser.add_argument('--calib_num', type=int, help='num to calib')
args = parser.parse_args()
return args
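# Example invocation (hypothetical config/checkpoint paths, shown for illustration only):
#   python tools/convert_to_trt.py configs/flashocc/flashocc-r50.py \
#       ckpts/flashocc-r50.pth work_dirs/trt/ --fp16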
def get_plugin_names():
return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list]
def create_calib_input_data_impl(calib_file: str,
dataloader: DataLoader,
model_partition: bool = False,
metas: list = [],
calib_num = None) -> None:
with h5py.File(calib_file, mode='w') as file:
calib_data_group = file.create_group('calib_data')
assert not model_partition
# create end2end group
input_data_group = calib_data_group.create_group('end2end')
input_group_img = input_data_group.create_group('img')
input_keys = [
'ranks_bev', 'ranks_depth', 'ranks_feat', 'interval_starts',
'interval_lengths'
]
input_groups = []
for input_key in input_keys:
input_groups.append(input_data_group.create_group(input_key))
metas = [
metas[i].int().detach().cpu().numpy() for i in range(len(metas))
]
for data_id, input_data in enumerate(tqdm.tqdm(dataloader)):
# save end2end data
if (calib_num is not None) and (data_id > calib_num):
break
input_tensor = input_data['img_inputs'][0][0]
input_ndarray = input_tensor.squeeze(0).detach().cpu().numpy()
# print(input_ndarray.shape, input_ndarray.dtype)
input_group_img.create_dataset(
str(data_id),
shape=input_ndarray.shape,
compression='gzip',
compression_opts=4,
data=input_ndarray)
for kid, input_key in enumerate(input_keys):
input_groups[kid].create_dataset(
str(data_id),
shape=metas[kid].shape,
compression='gzip',
compression_opts=4,
data=metas[kid])
file.flush()
def create_calib_input_data(calib_file: str,
deploy_cfg: Union[str, mmcv.Config],
model_cfg: Union[str, mmcv.Config],
model_checkpoint: Optional[str] = None,
dataset_cfg: Optional[Union[str,
mmcv.Config]] = None,
dataset_type: str = 'val',
device: str = 'cpu',
metas: list = [None],
calib_num = None) -> None:
"""Create dataset for post-training quantization.
Args:
calib_file (str): The output calibration data file.
deploy_cfg (str | mmcv.Config): Deployment config file or
Config object.
model_cfg (str | mmcv.Config): Model config file or Config object.
model_checkpoint (str): A checkpoint path of PyTorch model,
defaults to `None`.
dataset_cfg (Optional[Union[str, mmcv.Config]], optional): Model
config to provide calibration dataset. If none, use `model_cfg`
as the dataset config. Defaults to None.
dataset_type (str, optional): The dataset type. Defaults to 'val'.
device (str, optional): Device to create dataset. Defaults to 'cpu'.
"""
with no_mp():
if dataset_cfg is None:
dataset_cfg = model_cfg
# load cfg if necessary
deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
if dataset_cfg is None:
dataset_cfg = model_cfg
# load dataset_cfg if necessary
dataset_cfg = load_config(dataset_cfg)[0]
from mmdeploy.apis.utils import build_task_processor
task_processor = build_task_processor(model_cfg, deploy_cfg, device)
dataset = task_processor.build_dataset(dataset_cfg, dataset_type)
dataloader = task_processor.build_dataloader(
dataset, 1, 1, dist=False, shuffle=False)
create_calib_input_data_impl(
calib_file, dataloader, model_partition=False, metas=metas, calib_num=calib_num)
def from_onnx(onnx_model: Union[str, onnx.ModelProto],
output_file_prefix: str,
input_shapes: Dict[str, Sequence[int]],
max_workspace_size: int = 0,
fp16_mode: bool = False,
int8_mode: bool = False,
int8_param: Optional[dict] = None,
device_id: int = 0,
log_level: trt.Logger.Severity = trt.Logger.ERROR,
**kwargs) -> trt.ICudaEngine:
"""Create a tensorrt engine from ONNX.
Modified from mmdeploy.backend.tensorrt.utils.from_onnx
"""
import os
old_cuda_device = os.environ.get('CUDA_DEVICE', None)
os.environ['CUDA_DEVICE'] = str(device_id)
import pycuda.autoinit # noqa:F401
if old_cuda_device is not None:
os.environ['CUDA_DEVICE'] = old_cuda_device
else:
os.environ.pop('CUDA_DEVICE')
load_tensorrt_plugin()
# create builder and network
logger = trt.Logger(log_level)
builder = trt.Builder(logger)
    EXPLICIT_BATCH = 1 << int(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)
# parse onnx
parser = trt.OnnxParser(network, logger)
if isinstance(onnx_model, str):
onnx_model = onnx.load(onnx_model)
if not parser.parse(onnx_model.SerializeToString()):
error_msgs = ''
for error in range(parser.num_errors):
error_msgs += f'{parser.get_error(error)}\n'
raise RuntimeError(f'Failed to parse onnx, {error_msgs}')
# config builder
if version.parse(trt.__version__) < version.parse('8'):
builder.max_workspace_size = max_workspace_size
config = builder.create_builder_config()
config.max_workspace_size = max_workspace_size
cuda_version = search_cuda_version()
if cuda_version is not None:
version_major = int(cuda_version.split('.')[0])
if version_major < 11:
            # cuBLASLt is only available since CUDA 11, so drop the CUBLAS_LT tactic source for older CUDA versions  # noqa E501
tactic_source = config.get_tactic_sources() - (
1 << int(trt.TacticSource.CUBLAS_LT))
config.set_tactic_sources(tactic_source)
profile = builder.create_optimization_profile()
for input_name, param in input_shapes.items():
min_shape = param['min_shape']
opt_shape = param['opt_shape']
max_shape = param['max_shape']
profile.set_shape(input_name, min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)
if fp16_mode:
if version.parse(trt.__version__) < version.parse('8'):
builder.fp16_mode = fp16_mode
config.set_flag(trt.BuilderFlag.FP16)
if int8_mode:
config.set_flag(trt.BuilderFlag.INT8)
assert int8_param is not None
config.int8_calibrator = HDF5CalibratorBEVDet(
int8_param['calib_file'],
input_shapes,
model_type=int8_param['model_type'],
device_id=device_id,
algorithm=int8_param.get(
'algorithm', trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2))
if version.parse(trt.__version__) < version.parse('8'):
builder.int8_mode = int8_mode
builder.int8_calibrator = config.int8_calibrator
# create engine
engine = builder.build_engine(network, config)
assert engine is not None, 'Failed to create TensorRT engine'
save(engine, output_file_prefix + '.engine')
print('Save engine at ', output_file_prefix + '.engine')
return engine
def main():
args = parse_args()
max_workspace_size = 200*200*256*(2**8)
if not os.path.exists(args.work_dir):
os.makedirs(args.work_dir)
load_tensorrt_plugin()
assert 'bev_pool_v2' in get_plugin_names(), \
'bev_pool_v2 is not in the plugin list of tensorrt, ' \
'please install mmdeploy from ' \
'https://github.com/HuangJunJie2017/mmdeploy.git'
# if args.int8:
# assert args.fp16
model_prefix = args.prefix
if args.int8:
model_prefix = model_prefix + '_int8'
elif args.fp16:
model_prefix = model_prefix + '_fp16'
cfg = Config.fromfile(args.config)
cfg.model.pretrained = None
cfg.model.type = cfg.model.type + 'TRT'
cfg = compat_cfg(cfg)
cfg.gpu_ids = [0]
    # import modules from plugin/xx; the registry will be updated accordingly
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
test_dataloader_default_args = dict(
samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False)
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
test_loader_cfg = {
**test_dataloader_default_args,
**cfg.data.get('test_dataloader', {})
}
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, **test_loader_cfg)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
# assert model.img_view_transformer.grid_size[0] == 128
# assert model.img_view_transformer.grid_size[1] == 128
# assert model.img_view_transformer.grid_size[2] == 1
if os.path.exists(args.checkpoint):
load_checkpoint(model, args.checkpoint, map_location='cpu')
else:
print(args.checkpoint, " does not exists!")
if args.fuse_conv_bn:
model_prefix = model_prefix + '_fuse'
model = fuse_module(model)
model.cuda()
model.eval()
for i, data in enumerate(data_loader):
inputs = [t.cuda() for t in data['img_inputs'][0]]
img = inputs[0].squeeze(0)
if img.shape[0] > 6:
img = img[:6]
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT']:
metas = model.get_bev_pool_input(inputs, img_metas=data['img_metas'])
else:
if model.__class__.__name__ in ['BEVDetOCCTRT']:
metas = model.get_bev_pool_input(inputs)
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas, mlp_input = model.get_bev_pool_input(inputs)
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT', 'BEVDetOCCTRT']:
onnx_input = (img.float().contiguous(), metas[1].int().contiguous(),
metas[2].int().contiguous(), metas[0].int().contiguous(),
metas[3].int().contiguous(), metas[4].int().contiguous())
dynamic_axes={
"ranks_depth" : {0: 'M'},
"ranks_feat" : {0: 'M'},
"ranks_bev" : {0: 'M'},
"interval_starts" : {0: 'N'},
"interval_lengths" : {0: 'N'},
}
input_names=[
'img', 'ranks_depth', 'ranks_feat', 'ranks_bev',
'interval_starts', 'interval_lengths'
]
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
onnx_input = (img.float().contiguous(), metas[1].int().contiguous(),
metas[2].int().contiguous(), metas[0].int().contiguous(),
metas[3].int().contiguous(), metas[4].int().contiguous(), mlp_input)
dynamic_axes={
"ranks_depth" : {0: 'M'},
"ranks_feat" : {0: 'M'},
"ranks_bev" : {0: 'M'},
"interval_starts" : {0: 'N'},
"interval_lengths" : {0: 'N'},
# "mlp_input" : {0: 'K'},
}
input_names=[
'img', 'ranks_depth', 'ranks_feat', 'ranks_bev',
'interval_starts', 'interval_lengths', 'mlp_input',
]
with torch.no_grad():
            if model.wdet3d and not model.wocc:
                output_names = [f'output_{j}' for j in range(6 * len(model.pts_bbox_head.task_heads))]
            elif model.wdet3d and model.wocc:
                output_names = [f'output_{j}' for j in range(1 + 6 * len(model.pts_bbox_head.task_heads))]
            elif not model.wdet3d and model.wocc:
                output_names = ['output_0']
            else:
                raise ValueError('At least one of wdet3d and wocc must be set to True!')
model.forward = model.forward_ori
torch.onnx.export(
model,
onnx_input,
args.work_dir + model_prefix + '.onnx',
opset_version=11,
dynamic_axes=dynamic_axes,
input_names=input_names,
output_names=output_names)
print('output_names:', output_names)
print('====== onnx is saved at : ', args.work_dir + model_prefix + '.onnx')
# check onnx model
onnx_model = onnx.load(args.work_dir + model_prefix + '.onnx')
try:
onnx.checker.check_model(onnx_model)
except Exception:
print('ONNX Model Incorrect')
else:
print('ONNX Model Correct')
model.forward = model.forward_with_argmax
            output_names = ['cls_occ_label']
torch.onnx.export(
model,
onnx_input,
args.work_dir + model_prefix + '_with_argmax.onnx',
opset_version=11,
dynamic_axes=dynamic_axes,
input_names=input_names,
output_names=output_names)
print('output_names:', output_names)
print('====== onnx is saved at : ', args.work_dir + model_prefix + '_with_argmax.onnx')
# check onnx model
onnx_model = onnx.load(args.work_dir + model_prefix + '_with_argmax.onnx')
try:
onnx.checker.check_model(onnx_model)
except Exception:
print('ONNX Model Incorrect')
else:
print('ONNX Model Correct')
break
# convert to tensorrt
num_points = metas[0].shape[0]
num_intervals = metas[3].shape[0]
img_shape = img.shape
input_shapes = dict(
img=dict(
min_shape=img_shape, opt_shape=img_shape, max_shape=img_shape),
ranks_depth=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
ranks_feat=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
ranks_bev=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
interval_starts=dict(
min_shape=[num_intervals],
opt_shape=[num_intervals],
max_shape=[num_intervals]),
interval_lengths=dict(
min_shape=[num_intervals],
opt_shape=[num_intervals],
max_shape=[num_intervals]))
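    # Note: min/opt/max shapes are all pinned to the sizes observed in the single sample
    # processed above, so the TensorRT optimization profile effectively fixes the
    # dynamic axes to that sample's point/interval counts.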
deploy_cfg = dict(
backend_config=dict(
type='tensorrt',
common_config=dict(
fp16_mode=args.fp16,
max_workspace_size=max_workspace_size,
int8_mode=args.int8),
model_inputs=[dict(input_shapes=input_shapes)]),
codebase_config=dict(
type='mmdet3d', task='VoxelDetection', model_type='end2end'))
if args.int8:
calib_filename = 'calib_data.h5'
calib_path = os.path.join(args.work_dir, calib_filename)
create_calib_input_data(
calib_path,
deploy_cfg,
args.config,
args.checkpoint,
dataset_cfg=None,
dataset_type='val',
device='cuda:0',
metas=metas,
calib_num=args.calib_num)
from_onnx(
args.work_dir + model_prefix + '.onnx',
args.work_dir + model_prefix,
fp16_mode=args.fp16,
int8_mode=args.int8,
int8_param=dict(
calib_file=os.path.join(args.work_dir, 'calib_data.h5'),
model_type='end2end'),
max_workspace_size=max_workspace_size,
input_shapes=input_shapes)
# if args.int8:
# os.remove(calib_path)
if __name__ == '__main__':
main()