Commit 3b8d508a authored by lishj6

init_0905

parent e968ab0f
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
#include <math.h>  // ceil() used in the host-side launch configuration
__global__ void nearest_assign_kernel(
const int* l2s_key,
int l2s_size,
const int* occind2detind,
const int *__restrict__ occ_pred,
const int *__restrict__ inst_xyz,
const int *__restrict__ inst_cls,
const int *__restrict__ inst_id_list,
int inst_size,
int* __restrict__ inst_pred) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard against threads beyond the 200x200x16 occupancy grid (one thread per voxel).
  if (idx < 200 * 200 * 16)
  {
    int occ_pred_label = occ_pred[idx];
    int dist_min = 100000000;
    // Voxels whose semantic label appears in l2s_key get re-assigned to an instance id.
    for (int index = 0; index < l2s_size; index++)
    {
      if (occ_pred_label == l2s_key[index])
      {
        // Recover the (x, y, z) voxel coordinate from the flat index.
        int x = idx / (200 * 16);
        int y = (idx - x * 200 * 16) / 16;
        int z = idx - x * 200 * 16 - y * 16;
        // Assign the voxel to the nearest instance center of the matching detection class.
        for (int inst_ind = 0; inst_ind < inst_size; inst_ind++)
        {
          if (inst_cls[inst_ind] == occind2detind[occ_pred_label])
          {
            int dx = x - inst_xyz[inst_ind * 3 + 0];
            int dy = y - inst_xyz[inst_ind * 3 + 1];
            int dz = z - inst_xyz[inst_ind * 3 + 2];
            int dist = dx * dx + dy * dy + dz * dz;
            if (dist < dist_min) {
              dist_min = dist;
              inst_pred[idx] = inst_id_list[inst_ind];
            }
          }
        }
        return;
      }
    }
    // Labels without an instance mapping keep their semantic prediction.
    inst_pred[idx] = occ_pred[idx];
  }
}
void nearest_assign(
    const int* l2s_key,
    int l2s_size,
    const int *__restrict__ occind2detind,
    int inst_size,
    const int *__restrict__ occ_pred,
    const int *__restrict__ inst_xyz,
    const int *__restrict__ inst_cls,
    const int *__restrict__ inst_id_list,
    int* __restrict__ inst_pred) {
  // Launch one thread per voxel of the 200x200x16 grid, 256 threads per block.
  nearest_assign_kernel<<<(int)ceil((double)200 * 200 * 16 / 256), 256>>>(
      l2s_key, l2s_size, occind2detind,
      occ_pred, inst_xyz, inst_cls,
      inst_id_list, inst_size, inst_pred
  );
}
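# The block below is a rough NumPy reference of what the kernel above computes per
# voxel. It is illustrative only (it assumes the same 200x200x16 voxel layout and
# NumPy-array inputs) and is not part of the compiled extension.
import numpy as np

def nearest_assign_reference(l2s_key, occind2detind, occ_pred, inst_xyz, inst_cls, inst_id_list):
    # occ_pred: flat array of 200*200*16 semantic labels; inst_xyz: (N, 3) instance centers.
    inst_pred = occ_pred.copy()
    for idx, label in enumerate(occ_pred):
        if label not in l2s_key:
            continue  # labels without an instance mapping keep the semantic prediction
        # recover the (x, y, z) voxel coordinate from the flat index
        x, rem = divmod(idx, 200 * 16)
        y, z = divmod(rem, 16)
        # consider only instances whose detection class matches the occupancy label
        mask = inst_cls == occind2detind[label]
        if not mask.any():
            continue
        d2 = ((inst_xyz[mask] - np.array([x, y, z])) ** 2).sum(axis=1)
        inst_pred[idx] = inst_id_list[mask][np.argmin(d2)]
    return inst_pred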
from setuptools import find_packages, setup
import os
import shutil
import sys
import torch
import warnings
from os import path as osp
from torch.utils.cpp_extension import (BuildExtension, CppExtension,
CUDAExtension)
def make_cuda_ext(name,
module,
sources,
sources_cuda=[],
extra_args=[],
extra_include_path=[]):
define_macros = []
extra_compile_args = {'cxx': [] + extra_args}
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('WITH_CUDA', None)]
extension = CUDAExtension
extra_compile_args['nvcc'] = extra_args + [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
sources += sources_cuda
else:
print('Compiling {} without CUDA'.format(name))
extension = CppExtension
# raise EnvironmentError('CUDA is required to compile MMDetection!')
return extension(
name='{}.{}'.format(module, name),
sources=[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=extra_include_path,
define_macros=define_macros,
extra_compile_args=extra_compile_args)
if __name__ == '__main__':
setup(
name='flashocc_plugin',
description=("OpenMMLab's next-generation platform"
'for general 3D object detection.'),
long_description_content_type='text/markdown',
author='MMDetection3D Contributors',
author_email='zwwdev@gmail.com',
keywords='computer vision, 3D object detection',
url='https://github.com/open-mmlab/mmdetection3d',
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
license='Apache License 2.0',
ext_modules=[
make_cuda_ext(
name="bev_pool_ext",
module="mmdet3d_plugin.ops.bev_pool",
sources=[
"src/bev_pooling.cpp",
"src/bev_sum_pool.cpp",
"src/bev_sum_pool_cuda.cu",
"src/bev_max_pool.cpp",
"src/bev_max_pool_cuda.cu",
],
),
make_cuda_ext(
name="bev_pool_v2_ext",
module="mmdet3d_plugin.ops.bev_pool_v2",
sources=[
"src/bev_pool.cpp",
"src/bev_pool_cuda.cu"
],
),
make_cuda_ext(
name="nearest_assign_ext",
module="mmdet3d_plugin.ops.nearest_assign",
sources=[
"src/nearest_assign.cpp",
"src/nearest_assign_cuda.cu"
],
),
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)
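# Typical build/installation of the extensions declared above (assuming a working
# CUDA toolchain; this is standard setuptools usage, not specific to this repo):
#   pip install -v -e .
# which compiles bev_pool_ext, bev_pool_v2_ext and nearest_assign_ext in place.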
import torch
from torch import tensor, device
import torch.fx as fx
from torch._dynamo.testing import rand_strided
from math import inf
import torch._inductor.inductor_prims
import torch._dynamo.config
import torch._inductor.config
import torch._functorch.config
import torch.fx.experimental._config
torch._dynamo.config.capture_scalar_outputs = True
isolate_fails_code_str = None
# torch version: 2.4.1
# torch cuda version: None
# torch git version: 45d303c9e4f41ec2f5450b6f60031246f67189d6
# CUDA Info:
# nvcc not found
# GPU Hardware Info:
# BW200 : 8
from torch.nn import *
class Repro(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, primals_1, primals_2, primals_4, primals_5, primals_6, primals_7, primals_8, primals_10, convert_element_type_1, clamp_max, convert_element_type_3, clamp_max_1, clamp_max_2, clamp_max_3, cat, convolution, squeeze_1, relu, convolution_1, getitem_3, rsqrt_1, convert_element_type_5, clamp_max_4, convert_element_type_7, clamp_max_5, clamp_max_6, clamp_max_7, add_19, convolution_2, squeeze_7, relu_2, unsqueeze_14, unsqueeze_38, tangents_1):
sum_1 = torch.ops.aten.sum.dim_IntList(tangents_1, [0, 2, 3])
convolution_backward = torch.ops.aten.convolution_backward.default(tangents_1, relu_2, primals_10, [256], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]); tangents_1 = primals_10 = None
getitem_6 = convolution_backward[0]
getitem_7 = convolution_backward[1]; convolution_backward = None
le = torch.ops.aten.le.Scalar(relu_2, 0); relu_2 = None
full_default = torch.ops.aten.full.default([], 0.0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
where = torch.ops.aten.where.self(le, full_default, getitem_6); le = getitem_6 = None
sum_2 = torch.ops.aten.sum.dim_IntList(where, [0, 2, 3])
sub_13 = torch.ops.aten.sub.Tensor(convolution_2, unsqueeze_14); convolution_2 = unsqueeze_14 = None
mul_31 = torch.ops.aten.mul.Tensor(where, sub_13)
sum_3 = torch.ops.aten.sum.dim_IntList(mul_31, [0, 2, 3]); mul_31 = None
mul_32 = torch.ops.aten.mul.Tensor(sum_2, 6.25e-06)
unsqueeze_15 = torch.ops.aten.unsqueeze.default(mul_32, 0); mul_32 = None
unsqueeze_16 = torch.ops.aten.unsqueeze.default(unsqueeze_15, 2); unsqueeze_15 = None
unsqueeze_17 = torch.ops.aten.unsqueeze.default(unsqueeze_16, 3); unsqueeze_16 = None
mul_33 = torch.ops.aten.mul.Tensor(sum_3, 6.25e-06)
mul_34 = torch.ops.aten.mul.Tensor(squeeze_7, squeeze_7)
mul_35 = torch.ops.aten.mul.Tensor(mul_33, mul_34); mul_33 = mul_34 = None
unsqueeze_18 = torch.ops.aten.unsqueeze.default(mul_35, 0); mul_35 = None
unsqueeze_19 = torch.ops.aten.unsqueeze.default(unsqueeze_18, 2); unsqueeze_18 = None
unsqueeze_20 = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3); unsqueeze_19 = None
mul_36 = torch.ops.aten.mul.Tensor(squeeze_7, primals_8); primals_8 = None
unsqueeze_21 = torch.ops.aten.unsqueeze.default(mul_36, 0); mul_36 = None
unsqueeze_22 = torch.ops.aten.unsqueeze.default(unsqueeze_21, 2); unsqueeze_21 = None
unsqueeze_23 = torch.ops.aten.unsqueeze.default(unsqueeze_22, 3); unsqueeze_22 = None
mul_37 = torch.ops.aten.mul.Tensor(sub_13, unsqueeze_20); sub_13 = unsqueeze_20 = None
sub_15 = torch.ops.aten.sub.Tensor(where, mul_37); where = mul_37 = None
sub_16 = torch.ops.aten.sub.Tensor(sub_15, unsqueeze_17); sub_15 = unsqueeze_17 = None
mul_38 = torch.ops.aten.mul.Tensor(sub_16, unsqueeze_23); sub_16 = unsqueeze_23 = None
mul_39 = torch.ops.aten.mul.Tensor(sum_3, squeeze_7); sum_3 = squeeze_7 = None
convolution_backward_1 = torch.ops.aten.convolution_backward.default(mul_38, add_19, primals_7, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_38 = add_19 = primals_7 = None
getitem_9 = convolution_backward_1[0]
getitem_10 = convolution_backward_1[1]; convolution_backward_1 = None
mul_40 = torch.ops.aten.mul.Tensor(getitem_9, clamp_max_7); clamp_max_7 = None
neg = torch.ops.aten.neg.default(mul_40)
add_25 = torch.ops.aten.add.Tensor(getitem_9, neg); getitem_9 = neg = None
mul_41 = torch.ops.aten.mul.Tensor(mul_40, clamp_max_6)
neg_1 = torch.ops.aten.neg.default(mul_41)
add_26 = torch.ops.aten.add.Tensor(mul_40, neg_1); mul_40 = neg_1 = None
mul_42 = torch.ops.aten.mul.Tensor(add_25, clamp_max_6); clamp_max_6 = None
neg_2 = torch.ops.aten.neg.default(mul_42)
add_27 = torch.ops.aten.add.Tensor(add_25, neg_2); add_25 = neg_2 = None
full_default_1 = torch.ops.aten.full.default([4, 512, 100, 100], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
_unsafe_index_put = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, clamp_max_5], mul_41, True); mul_41 = None
_unsafe_index_put_1 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, convert_element_type_7], add_26, True); clamp_max_4 = add_26 = None
add_28 = torch.ops.aten.add.Tensor(_unsafe_index_put, _unsafe_index_put_1); _unsafe_index_put = _unsafe_index_put_1 = None
_unsafe_index_put_2 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, clamp_max_5], mul_42, True); clamp_max_5 = mul_42 = None
add_29 = torch.ops.aten.add.Tensor(add_28, _unsafe_index_put_2); add_28 = _unsafe_index_put_2 = None
_unsafe_index_put_3 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, convert_element_type_7], add_27, True); full_default_1 = convert_element_type_5 = convert_element_type_7 = add_27 = None
add_30 = torch.ops.aten.add.Tensor(add_29, _unsafe_index_put_3); add_29 = _unsafe_index_put_3 = None
sub_6 = torch.ops.aten.sub.Tensor(convolution_1, getitem_3)
mul_12 = torch.ops.aten.mul.Tensor(sub_6, rsqrt_1); sub_6 = None
unsqueeze_4 = torch.ops.aten.unsqueeze.default(primals_5, -1)
unsqueeze_5 = torch.ops.aten.unsqueeze.default(unsqueeze_4, -1); unsqueeze_4 = None
mul_18 = torch.ops.aten.mul.Tensor(mul_12, unsqueeze_5); mul_12 = unsqueeze_5 = None
unsqueeze_6 = torch.ops.aten.unsqueeze.default(primals_6, -1); primals_6 = None
unsqueeze_7 = torch.ops.aten.unsqueeze.default(unsqueeze_6, -1); unsqueeze_6 = None
add_14 = torch.ops.aten.add.Tensor(mul_18, unsqueeze_7); mul_18 = unsqueeze_7 = None
relu_1 = torch.ops.aten.relu.default(add_14); add_14 = None
le_1 = torch.ops.aten.le.Scalar(relu_1, 0); relu_1 = None
where_1 = torch.ops.aten.where.self(le_1, full_default, add_30); le_1 = add_30 = None
squeeze_3 = torch.ops.aten.squeeze.dims(getitem_3, [0, 2, 3]); getitem_3 = None
unsqueeze_24 = torch.ops.aten.unsqueeze.default(squeeze_3, 0); squeeze_3 = None
unsqueeze_25 = torch.ops.aten.unsqueeze.default(unsqueeze_24, 2); unsqueeze_24 = None
unsqueeze_26 = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3); unsqueeze_25 = None
sum_4 = torch.ops.aten.sum.dim_IntList(where_1, [0, 2, 3])
sub_17 = torch.ops.aten.sub.Tensor(convolution_1, unsqueeze_26); convolution_1 = unsqueeze_26 = None
mul_43 = torch.ops.aten.mul.Tensor(where_1, sub_17)
sum_5 = torch.ops.aten.sum.dim_IntList(mul_43, [0, 2, 3]); mul_43 = None
mul_44 = torch.ops.aten.mul.Tensor(sum_4, 2.5e-05)
unsqueeze_27 = torch.ops.aten.unsqueeze.default(mul_44, 0); mul_44 = None
unsqueeze_28 = torch.ops.aten.unsqueeze.default(unsqueeze_27, 2); unsqueeze_27 = None
unsqueeze_29 = torch.ops.aten.unsqueeze.default(unsqueeze_28, 3); unsqueeze_28 = None
mul_45 = torch.ops.aten.mul.Tensor(sum_5, 2.5e-05)
squeeze_4 = torch.ops.aten.squeeze.dims(rsqrt_1, [0, 2, 3]); rsqrt_1 = None
mul_46 = torch.ops.aten.mul.Tensor(squeeze_4, squeeze_4)
mul_47 = torch.ops.aten.mul.Tensor(mul_45, mul_46); mul_45 = mul_46 = None
unsqueeze_30 = torch.ops.aten.unsqueeze.default(mul_47, 0); mul_47 = None
unsqueeze_31 = torch.ops.aten.unsqueeze.default(unsqueeze_30, 2); unsqueeze_30 = None
unsqueeze_32 = torch.ops.aten.unsqueeze.default(unsqueeze_31, 3); unsqueeze_31 = None
mul_48 = torch.ops.aten.mul.Tensor(squeeze_4, primals_5); primals_5 = None
unsqueeze_33 = torch.ops.aten.unsqueeze.default(mul_48, 0); mul_48 = None
unsqueeze_34 = torch.ops.aten.unsqueeze.default(unsqueeze_33, 2); unsqueeze_33 = None
unsqueeze_35 = torch.ops.aten.unsqueeze.default(unsqueeze_34, 3); unsqueeze_34 = None
mul_49 = torch.ops.aten.mul.Tensor(sub_17, unsqueeze_32); sub_17 = unsqueeze_32 = None
sub_19 = torch.ops.aten.sub.Tensor(where_1, mul_49); where_1 = mul_49 = None
sub_20 = torch.ops.aten.sub.Tensor(sub_19, unsqueeze_29); sub_19 = unsqueeze_29 = None
mul_50 = torch.ops.aten.mul.Tensor(sub_20, unsqueeze_35); sub_20 = unsqueeze_35 = None
mul_51 = torch.ops.aten.mul.Tensor(sum_5, squeeze_4); sum_5 = squeeze_4 = None
convolution_backward_2 = torch.ops.aten.convolution_backward.default(mul_50, relu, primals_4, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_50 = primals_4 = None
getitem_12 = convolution_backward_2[0]
getitem_13 = convolution_backward_2[1]; convolution_backward_2 = None
le_2 = torch.ops.aten.le.Scalar(relu, 0); relu = None
where_2 = torch.ops.aten.where.self(le_2, full_default, getitem_12); le_2 = full_default = getitem_12 = None
sum_6 = torch.ops.aten.sum.dim_IntList(where_2, [0, 2, 3])
sub_21 = torch.ops.aten.sub.Tensor(convolution, unsqueeze_38); convolution = unsqueeze_38 = None
mul_52 = torch.ops.aten.mul.Tensor(where_2, sub_21)
sum_7 = torch.ops.aten.sum.dim_IntList(mul_52, [0, 2, 3]); mul_52 = None
mul_53 = torch.ops.aten.mul.Tensor(sum_6, 2.5e-05)
unsqueeze_39 = torch.ops.aten.unsqueeze.default(mul_53, 0); mul_53 = None
unsqueeze_40 = torch.ops.aten.unsqueeze.default(unsqueeze_39, 2); unsqueeze_39 = None
unsqueeze_41 = torch.ops.aten.unsqueeze.default(unsqueeze_40, 3); unsqueeze_40 = None
mul_54 = torch.ops.aten.mul.Tensor(sum_7, 2.5e-05)
mul_55 = torch.ops.aten.mul.Tensor(squeeze_1, squeeze_1)
mul_56 = torch.ops.aten.mul.Tensor(mul_54, mul_55); mul_54 = mul_55 = None
unsqueeze_42 = torch.ops.aten.unsqueeze.default(mul_56, 0); mul_56 = None
unsqueeze_43 = torch.ops.aten.unsqueeze.default(unsqueeze_42, 2); unsqueeze_42 = None
unsqueeze_44 = torch.ops.aten.unsqueeze.default(unsqueeze_43, 3); unsqueeze_43 = None
mul_57 = torch.ops.aten.mul.Tensor(squeeze_1, primals_2); primals_2 = None
unsqueeze_45 = torch.ops.aten.unsqueeze.default(mul_57, 0); mul_57 = None
unsqueeze_46 = torch.ops.aten.unsqueeze.default(unsqueeze_45, 2); unsqueeze_45 = None
unsqueeze_47 = torch.ops.aten.unsqueeze.default(unsqueeze_46, 3); unsqueeze_46 = None
mul_58 = torch.ops.aten.mul.Tensor(sub_21, unsqueeze_44); sub_21 = unsqueeze_44 = None
sub_23 = torch.ops.aten.sub.Tensor(where_2, mul_58); where_2 = mul_58 = None
sub_24 = torch.ops.aten.sub.Tensor(sub_23, unsqueeze_41); sub_23 = unsqueeze_41 = None
mul_59 = torch.ops.aten.mul.Tensor(sub_24, unsqueeze_47); sub_24 = unsqueeze_47 = None
mul_60 = torch.ops.aten.mul.Tensor(sum_7, squeeze_1); sum_7 = squeeze_1 = None
convolution_backward_3 = torch.ops.aten.convolution_backward.default(mul_59, cat, primals_1, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]); mul_59 = cat = primals_1 = None
getitem_15 = convolution_backward_3[0]
getitem_16 = convolution_backward_3[1]; convolution_backward_3 = None
slice_1 = torch.ops.aten.slice.Tensor(getitem_15, 1, 0, 128)
slice_2 = torch.ops.aten.slice.Tensor(getitem_15, 1, 128, 640); getitem_15 = None
mul_61 = torch.ops.aten.mul.Tensor(slice_2, clamp_max_3); clamp_max_3 = None
neg_3 = torch.ops.aten.neg.default(mul_61)
add_31 = torch.ops.aten.add.Tensor(slice_2, neg_3); slice_2 = neg_3 = None
mul_62 = torch.ops.aten.mul.Tensor(mul_61, clamp_max_2)
neg_4 = torch.ops.aten.neg.default(mul_62)
add_32 = torch.ops.aten.add.Tensor(mul_61, neg_4); mul_61 = neg_4 = None
mul_63 = torch.ops.aten.mul.Tensor(add_31, clamp_max_2); clamp_max_2 = None
neg_5 = torch.ops.aten.neg.default(mul_63)
add_33 = torch.ops.aten.add.Tensor(add_31, neg_5); add_31 = neg_5 = None
full_default_7 = torch.ops.aten.full.default([4, 512, 25, 25], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
_unsafe_index_put_4 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, clamp_max_1], mul_62, True); mul_62 = None
_unsafe_index_put_5 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, convert_element_type_3], add_32, True); clamp_max = add_32 = None
add_34 = torch.ops.aten.add.Tensor(_unsafe_index_put_4, _unsafe_index_put_5); _unsafe_index_put_4 = _unsafe_index_put_5 = None
_unsafe_index_put_6 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, clamp_max_1], mul_63, True); clamp_max_1 = mul_63 = None
add_35 = torch.ops.aten.add.Tensor(add_34, _unsafe_index_put_6); add_34 = _unsafe_index_put_6 = None
_unsafe_index_put_7 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, convert_element_type_3], add_33, True); full_default_7 = convert_element_type_1 = convert_element_type_3 = add_33 = None
add_36 = torch.ops.aten.add.Tensor(add_35, _unsafe_index_put_7); add_35 = _unsafe_index_put_7 = None
return [getitem_16, mul_60, sum_6, getitem_13, mul_51, sum_4, getitem_10, mul_39, sum_2, getitem_7, sum_1, None, None, None, None, None, None, None, None, None, slice_1, add_36]
def load_args(reader):
buf0 = reader.storage('934c55e4a7a69a0a29a96cd8ef9f11c9859658e1', 11796480, device=device(type='cuda', index=2))
reader.tensor(buf0, (512, 640, 3, 3), requires_grad=True, is_leaf=True) # primals_1
buf1 = reader.storage('f12094f433480ec90280d223057708434df38941', 2048, device=device(type='cuda', index=2))
reader.tensor(buf1, (512,), requires_grad=True, is_leaf=True) # primals_2
buf2 = reader.storage('06c46ad2c91ec5c8eebc4fb0be80459bdfe007a8', 9437184, device=device(type='cuda', index=2))
reader.tensor(buf2, (512, 512, 3, 3), requires_grad=True, is_leaf=True) # primals_4
buf3 = reader.storage('aba0c4266c842d1845e720dc0c789942770a60b7', 2048, device=device(type='cuda', index=2))
reader.tensor(buf3, (512,), requires_grad=True, is_leaf=True) # primals_5
buf4 = reader.storage('bb8471d379e03c8ccb9897ce7d3a2dfbacb44e30', 2048, device=device(type='cuda', index=2))
reader.tensor(buf4, (512,), requires_grad=True, is_leaf=True) # primals_6
buf5 = reader.storage('b9484105fb5b2045fb6550a1edb77af72e639416', 4718592, device=device(type='cuda', index=2))
reader.tensor(buf5, (256, 512, 3, 3), requires_grad=True, is_leaf=True) # primals_7
buf6 = reader.storage('b778b8cab416c3fa6763b88e431266ae6ea28941', 1024, device=device(type='cuda', index=2))
reader.tensor(buf6, (256,), requires_grad=True, is_leaf=True) # primals_8
buf7 = reader.storage('c5f14ec72c73a593b47ef4aecf37f6bb25d2dec4', 262144, device=device(type='cuda', index=2))
reader.tensor(buf7, (256, 256, 1, 1), requires_grad=True, is_leaf=True) # primals_10
buf8 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf8, (100, 1), dtype=torch.int64, is_leaf=True) # convert_element_type_1
buf9 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf9, (100, 1), dtype=torch.int64, is_leaf=True) # clamp_max
buf10 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf10, (100,), dtype=torch.int64, is_leaf=True) # convert_element_type_3
buf11 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf11, (100,), dtype=torch.int64, is_leaf=True) # clamp_max_1
buf12 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
reader.tensor(buf12, (100,), is_leaf=True) # clamp_max_2
buf13 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
reader.tensor(buf13, (100, 1), is_leaf=True) # clamp_max_3
buf14 = reader.storage('5d41e66671a283b70001fd74345d8e7e3def00bd', 102400000, device=device(type='cuda', index=2))
reader.tensor(buf14, (4, 640, 100, 100), is_leaf=True) # cat
buf15 = reader.storage('a8fe0ed584571bb3218d663656459a36545be5e6', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf15, (4, 512, 100, 100), is_leaf=True) # convolution
buf16 = reader.storage('0af13bcf109b8ca2df7f5ce3387d51e8576fb30a', 2048, device=device(type='cuda', index=2))
reader.tensor(buf16, (512,), is_leaf=True) # squeeze_1
buf17 = reader.storage('32f14d6fa07f654fbb09ef1563066303a3501eda', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf17, (4, 512, 100, 100), is_leaf=True) # relu
buf18 = reader.storage('aca23d51e723ad9b4bec2e54d6f0af4b5b85cc7d', 81920000, device=device(type='cuda', index=2))
reader.tensor(buf18, (4, 512, 100, 100), is_leaf=True) # convolution_1
buf19 = reader.storage('4940c79e48676c2e1359870dc770e25cd780983d', 2048, device=device(type='cuda', index=2))
reader.tensor(buf19, (1, 512, 1, 1), is_leaf=True) # getitem_3
buf20 = reader.storage('d17407a9f45954a4d0d36e5b20a40ac554cc3aff', 2048, device=device(type='cuda', index=2))
reader.tensor(buf20, (1, 512, 1, 1), is_leaf=True) # rsqrt_1
buf21 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf21, (200, 1), dtype=torch.int64, is_leaf=True) # convert_element_type_5
buf22 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf22, (200, 1), dtype=torch.int64, is_leaf=True) # clamp_max_4
buf23 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf23, (200,), dtype=torch.int64, is_leaf=True) # convert_element_type_7
buf24 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
reader.tensor(buf24, (200,), dtype=torch.int64, is_leaf=True) # clamp_max_5
buf25 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
reader.tensor(buf25, (200,), is_leaf=True) # clamp_max_6
buf26 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
reader.tensor(buf26, (200, 1), is_leaf=True) # clamp_max_7
buf27 = reader.storage('32194c54194bddd5f695a8d306828130629246fc', 327680000, device=device(type='cuda', index=2))
reader.tensor(buf27, (4, 512, 200, 200), is_leaf=True) # add_19
buf28 = reader.storage('e3a286ef8d6373c83ef30afe16eaae96ee52b965', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf28, (4, 256, 200, 200), is_leaf=True) # convolution_2
buf29 = reader.storage('9572b289e6d5c9bdd20a79367d4005440da40795', 1024, device=device(type='cuda', index=2))
reader.tensor(buf29, (256,), is_leaf=True) # squeeze_7
buf30 = reader.storage('42f9ce794a05b12a40f15cbd4abb1201ccef0f72', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf30, (4, 256, 200, 200), is_leaf=True) # relu_2
buf31 = reader.storage('61670207f087dc68f052bc03747d9ab365297b17', 1024, device=device(type='cuda', index=2))
reader.tensor(buf31, (1, 256, 1, 1), is_leaf=True) # unsqueeze_14
buf32 = reader.storage('ab77896e6dd76345e63586ecda30b1e4a63439cc', 2048, device=device(type='cuda', index=2))
reader.tensor(buf32, (1, 512, 1, 1), is_leaf=True) # unsqueeze_38
buf33 = reader.storage('f0ec623d2a44ff0f64fc264faf9128c2a6896e57', 163840000, device=device(type='cuda', index=2))
reader.tensor(buf33, (4, 256, 200, 200), is_leaf=True) # tangents_1
load_args._version = 0
mod = Repro()
if __name__ == '__main__':
from torch._dynamo.repro.after_aot import run_repro
with torch.no_grad():
run_repro(mod, load_args, accuracy=True, command='run', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
# To run it separately, do
# mod, args = run_repro(mod, load_args, accuracy=True, command='get_args', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
# mod(*args)
\ No newline at end of file
-r requirements/build.txt
-r requirements/optional.txt
-r requirements/runtime.txt
-r requirements/tests.txt
docutils==0.16.0
m2r
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==4.0.2
sphinx-copybutton
sphinx_markdown_tables
mmcv-full>=1.4.8,<=1.6.0
mmdet>=2.24.0,<=3.0.0
mmsegmentation>=0.20.0,<=1.0.0
open3d
spconv
waymo-open-dataset-tf-2-1-0==1.2.0
mmcv>=1.4.8
mmdet>=2.24.0
mmsegmentation>=0.20.1
torch
torchvision
lyft_dataset_sdk
networkx>=2.2,<2.3
numba==0.53.0
numpy
nuscenes-devkit
plyfile
scikit-image
# by default we also use tensorboard to log results
tensorboard
trimesh>=2.35.39,<2.35.40
asynctest
codecov
flake8
interrogate
isort
# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
kwarray
pytest
pytest-cov
pytest-runner
ubelt
xdoctest >= 0.10.0
yapf
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from collections import defaultdict
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
def cal_train_time(log_dicts, args):
for i, log_dict in enumerate(log_dicts):
print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
all_times = []
for epoch in log_dict.keys():
if args.include_outliers:
all_times.append(log_dict[epoch]['time'])
else:
all_times.append(log_dict[epoch]['time'][1:])
all_times = np.array(all_times)
epoch_ave_time = all_times.mean(-1)
slowest_epoch = epoch_ave_time.argmax()
fastest_epoch = epoch_ave_time.argmin()
std_over_epoch = epoch_ave_time.std()
print(f'slowest epoch {slowest_epoch + 1}, '
f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
print(f'fastest epoch {fastest_epoch + 1}, '
f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
print(f'time std over epochs is {std_over_epoch:.4f}')
print(f'average iter time: {np.mean(all_times):.4f} s/iter')
print()
def plot_curve(log_dicts, args):
if args.backend is not None:
plt.switch_backend(args.backend)
sns.set_style(args.style)
# if legend is None, use {filename}_{key} as legend
legend = args.legend
if legend is None:
legend = []
for json_log in args.json_logs:
for metric in args.keys:
legend.append(f'{json_log}_{metric}')
assert len(legend) == (len(args.json_logs) * len(args.keys))
metrics = args.keys
num_metrics = len(metrics)
for i, log_dict in enumerate(log_dicts):
epochs = list(log_dict.keys())
for j, metric in enumerate(metrics):
print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
if metric not in log_dict[epochs[args.interval - 1]]:
raise KeyError(
f'{args.json_logs[i]} does not contain metric {metric}')
if args.mode == 'eval':
if min(epochs) == args.interval:
x0 = args.interval
else:
# if current training is resumed from previous checkpoint
# we lost information in early epochs
# `xs` should start according to `min(epochs)`
if min(epochs) % args.interval == 0:
x0 = min(epochs)
else:
# find the first epoch that do eval
x0 = min(epochs) + args.interval - \
min(epochs) % args.interval
xs = np.arange(x0, max(epochs) + 1, args.interval)
ys = []
for epoch in epochs[args.interval - 1::args.interval]:
ys += log_dict[epoch][metric]
# if training is aborted before eval of the last epoch
# `xs` and `ys` will have different length and cause an error
# check if `ys[-1]` is empty here
if not log_dict[epoch][metric]:
xs = xs[:-1]
ax = plt.gca()
ax.set_xticks(xs)
plt.xlabel('epoch')
plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
else:
xs = []
ys = []
num_iters_per_epoch = \
log_dict[epochs[args.interval-1]]['iter'][-1]
for epoch in epochs[args.interval - 1::args.interval]:
iters = log_dict[epoch]['iter']
if log_dict[epoch]['mode'][-1] == 'val':
iters = iters[:-1]
xs.append(
np.array(iters) + (epoch - 1) * num_iters_per_epoch)
ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
xs = np.concatenate(xs)
ys = np.concatenate(ys)
plt.xlabel('iter')
plt.plot(
xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
plt.legend()
if args.title is not None:
plt.title(args.title)
if args.out is None:
plt.show()
else:
print(f'save curve to: {args.out}')
plt.savefig(args.out)
plt.cla()
def add_plot_parser(subparsers):
parser_plt = subparsers.add_parser(
'plot_curve', help='parser for plotting curves')
parser_plt.add_argument(
'json_logs',
type=str,
nargs='+',
help='path of train log in json format')
parser_plt.add_argument(
'--keys',
type=str,
nargs='+',
default=['mAP_0.25'],
help='the metric that you want to plot')
parser_plt.add_argument('--title', type=str, help='title of figure')
parser_plt.add_argument(
'--legend',
type=str,
nargs='+',
default=None,
help='legend of each plot')
parser_plt.add_argument(
'--backend', type=str, default=None, help='backend of plt')
parser_plt.add_argument(
'--style', type=str, default='dark', help='style of plt')
parser_plt.add_argument('--out', type=str, default=None)
parser_plt.add_argument('--mode', type=str, default='train')
parser_plt.add_argument('--interval', type=int, default=1)
def add_time_parser(subparsers):
parser_time = subparsers.add_parser(
'cal_train_time',
help='parser for computing the average time per training iteration')
parser_time.add_argument(
'json_logs',
type=str,
nargs='+',
help='path of train log in json format')
parser_time.add_argument(
'--include-outliers',
action='store_true',
help='include the first value of every epoch when computing '
'the average time')
def parse_args():
parser = argparse.ArgumentParser(description='Analyze Json Log')
# currently only support plot curve and calculate average train time
subparsers = parser.add_subparsers(dest='task', help='task parser')
add_plot_parser(subparsers)
add_time_parser(subparsers)
args = parser.parse_args()
return args
def load_json_logs(json_logs):
# load and convert json_logs to log_dict, key is epoch, value is a sub dict
# keys of sub dict is different metrics, e.g. memory, bbox_mAP
# value of sub dict is a list of corresponding values of all iterations
log_dicts = [dict() for _ in json_logs]
for json_log, log_dict in zip(json_logs, log_dicts):
with open(json_log, 'r') as log_file:
for line in log_file:
log = json.loads(line.strip())
# skip lines without `epoch` field
if 'epoch' not in log:
continue
epoch = log.pop('epoch')
if epoch not in log_dict:
log_dict[epoch] = defaultdict(list)
for k, v in log.items():
log_dict[epoch][k].append(v)
return log_dicts
def main():
args = parse_args()
json_logs = args.json_logs
for json_log in json_logs:
assert json_log.endswith('.json')
log_dicts = load_json_logs(json_logs)
eval(args.task)(log_dicts, args)
if __name__ == '__main__':
main()
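# Example invocations (script and log paths are placeholders):
#   python tools/analysis_tools/analyze_logs.py plot_curve work_dirs/run/log.json --keys loss --out loss.png
#   python tools/analysis_tools/analyze_logs.py cal_train_time work_dirs/run/log.json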
# Copyright (c) OpenMMLab. All rights reserved.
import sys
import argparse
import time
import os
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
from tools.misc.fuse_conv_bn import fuse_module
sys.path.insert(0, os.getcwd())
print(sys.path)
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=500, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
parser.add_argument(
'--w_pano',
action='store_true')
parser.add_argument(
'--w_panoproc',
action='store_true')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=0,
dist=False,
shuffle=False)
# build the model and load checkpoint
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
# for i, data_ori in enumerate(data_loader):
# if i == 0:
# break
# import copy
# for i in range(500):
# data = copy.deepcopy(data_ori)
for i, data in enumerate(data_loader):
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
model(return_loss=False, rescale=True,
w_pano=args.w_pano,
w_panoproc=args.w_panoproc,
**data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000 / fps:.2f} ms')
break
if __name__ == '__main__':
main()
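# Example invocation (script, config and checkpoint paths are placeholders):
#   python tools/analysis_tools/benchmark.py configs/flashocc/flashocc-r50.py work_dirs/flashocc/epoch_24.pth --fuse-conv-bn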
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import os
import sys
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
from tools.misc.fuse_conv_bn import fuse_module
sys.path.insert(0, os.getcwd())
print(sys.path)
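# Note: unlike the benchmark above, this variant times the sequential (temporal)
# setting: previous-frame BEV features are pre-computed once via
# extract_img_feat(..., pred_prev=True) and then reused through `feat_prev` with
# sequential=True, with align_after_view_transfromation enabled in the config.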
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=400, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
             'the inference speed')
parser.add_argument(
'--w_pano',
action='store_true')
parser.add_argument(
'--w_panoproc',
action='store_true')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=0,
dist=False,
shuffle=False)
# build the model and load checkpoint
cfg.model.train_cfg = None
cfg.model.align_after_view_transfromation=True
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
for i, data in enumerate(data_loader):
inputs = [d.cuda() for d in data['img_inputs'][0]]
with torch.no_grad():
feat_prev, inputs = model.module.extract_img_feat(
inputs, pred_prev=True, img_metas=None)
data['img_inputs'][0] = inputs
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
model(
return_loss=False,
rescale=True,
sequential=True,
feat_prev=feat_prev,
w_pano=args.w_pano,
w_panoproc=args.w_panoproc,
**data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000 / fps:.2f} ms')
break
if __name__ == '__main__':
main()
\ No newline at end of file
import time
from typing import Dict, Optional, Sequence, Union
import os
from os import path as osp
import sys
sys.path.insert(0, os.getcwd())
import tensorrt as trt
import torch
import torch.onnx
from mmcv import Config
from mmdeploy.backend.tensorrt import load_tensorrt_plugin
try:
# If mmdet version > 2.23.0, compat_cfg would be imported and
# used from mmdet instead of mmdet3d.
from mmdet.utils import compat_cfg
except ImportError:
from mmdet3d.utils import compat_cfg
import argparse
from mmdet3d.core import bbox3d2result
from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model
def parse_args():
parser = argparse.ArgumentParser(description='Deploy BEVDet with Tensorrt')
parser.add_argument('config', help='deploy config file path')
parser.add_argument('engine', help='checkpoint file')
    parser.add_argument('--samples', type=int, default=500, help='samples to benchmark')
parser.add_argument('--postprocessing', action='store_true')
parser.add_argument('--eval', action='store_true')
    parser.add_argument('--prefetch', action='store_true',
                        help='use prefetch to accelerate the data loading; '
                             'the inference speed is slightly degraded due '
                             'to the computational overhead of prefetching')
args = parser.parse_args()
return args
def torch_dtype_from_trt(dtype: trt.DataType) -> torch.dtype:
"""Convert pytorch dtype to TensorRT dtype.
Args:
dtype (str.DataType): The data type in tensorrt.
Returns:
torch.dtype: The corresponding data type in torch.
"""
if dtype == trt.bool:
return torch.bool
elif dtype == trt.int8:
return torch.int8
elif dtype == trt.int32:
return torch.int32
elif dtype == trt.float16:
return torch.float16
elif dtype == trt.float32:
return torch.float32
else:
raise TypeError(f'{dtype} is not supported by torch')
class TRTWrapper(torch.nn.Module):
def __init__(self,
engine: Union[str, trt.ICudaEngine],
output_names: Optional[Sequence[str]] = None) -> None:
super().__init__()
self.engine = engine
if isinstance(self.engine, str):
with trt.Logger() as logger, trt.Runtime(logger) as runtime:
with open(self.engine, mode='rb') as f:
engine_bytes = f.read()
self.engine = runtime.deserialize_cuda_engine(engine_bytes)
self.context = self.engine.create_execution_context()
names = [_ for _ in self.engine]
input_names = list(filter(self.engine.binding_is_input, names))
self._input_names = input_names
self._output_names = output_names
if self._output_names is None:
output_names = list(set(names) - set(input_names))
self._output_names = output_names
def forward(self, inputs: Dict[str, torch.Tensor]):
bindings = [None] * (len(self._input_names) + len(self._output_names))
for input_name, input_tensor in inputs.items():
idx = self.engine.get_binding_index(input_name)
self.context.set_binding_shape(idx, tuple(input_tensor.shape))
bindings[idx] = input_tensor.contiguous().data_ptr()
# create output tensors
outputs = {}
for output_name in self._output_names:
idx = self.engine.get_binding_index(output_name)
dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
shape = tuple(self.context.get_binding_shape(idx))
device = torch.device('cuda')
output = torch.zeros(size=shape, dtype=dtype, device=device)
outputs[output_name] = output
bindings[idx] = output.data_ptr()
self.context.execute_async_v2(bindings,
torch.cuda.current_stream().cuda_stream)
return outputs
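# Example usage of TRTWrapper (illustrative only; the engine path and the binding
# names depend on how the engine was exported, and the input dict must contain a
# tensor for every input binding):
#   trt_model = TRTWrapper('bevdet.engine', output_names=['output_0'])
#   outputs = trt_model.forward(dict(img=img, **metas))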
def get_plugin_names():
return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list]
def main():
load_tensorrt_plugin()
args = parse_args()
    if args.eval:
        args.postprocessing = True
        print('Warning: evaluation requested, forcing '
              'postprocessing=True so that results can be evaluated')
cfg = Config.fromfile(args.config)
cfg.model.pretrained = None
cfg.model.type = cfg.model.type + 'TRT'
cfg = compat_cfg(cfg)
cfg.gpu_ids = [0]
if not args.prefetch:
cfg.data.test_dataloader.workers_per_gpu=0
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build dataloader
assert cfg.data.test.test_mode
test_dataloader_default_args = dict(
samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False)
test_loader_cfg = {
**test_dataloader_default_args,
**cfg.data.get('test_dataloader', {})
}
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, **test_loader_cfg)
# build the model
cfg.model.train_cfg = None
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
# build tensorrt model
    if cfg.model.get('wdet3d', True) and not cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(6 * len(model.pts_bbox_head.task_heads))])
    elif cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(1 + 6 * len(model.pts_bbox_head.task_heads))])
    elif not cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
        trt_model = TRTWrapper(args.engine, [f'output_{i}' for i in range(1)])
    else:
        raise ValueError('At least one of wdet3d and wocc must be set to True!')
num_warmup = 50
pure_inf_time = 0
init_ = True
metas = dict()
# benchmark with several samples and take the average
results = list()
for i, data in enumerate(data_loader):
if init_:
inputs = [t.cuda() for t in data['img_inputs'][0]]
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT']:
metas_ = model.get_bev_pool_input(inputs, img_metas=data['img_metas'])
else:
if model.__class__.__name__ in ['BEVDetOCCTRT']:
metas_ = model.get_bev_pool_input(inputs)
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas_, mlp_input = model.get_bev_pool_input(inputs)
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT', 'BEVDetOCCTRT']:
metas = dict(
ranks_bev=metas_[0].int().contiguous(),
ranks_depth=metas_[1].int().contiguous(),
ranks_feat=metas_[2].int().contiguous(),
interval_starts=metas_[3].int().contiguous(),
interval_lengths=metas_[4].int().contiguous())
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas = dict(
ranks_bev=metas_[0].int().contiguous(),
ranks_depth=metas_[1].int().contiguous(),
ranks_feat=metas_[2].int().contiguous(),
interval_starts=metas_[3].int().contiguous(),
interval_lengths=metas_[4].int().contiguous(),
mlp_input=mlp_input)
init_ = False
img = data['img_inputs'][0][0].cuda().squeeze(0).contiguous()
if img.shape[0] > 6:
img = img[:6]
torch.cuda.synchronize()
start_time = time.perf_counter()
trt_output = trt_model.forward(dict(img=img, **metas))
# postprocessing
if args.postprocessing:
if cfg.model.get('wdet3d', True):
trt_output_det = [trt_output[f'output_{i}'] for i in
range(6 * len(model.pts_bbox_head.task_heads))]
pred = model.result_deserialize(trt_output_det)
img_metas = [dict(box_type_3d=LiDARInstance3DBoxes)]
bbox_list = model.pts_bbox_head.get_bboxes(
pred, img_metas, rescale=True)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
if cfg.model.get('wocc', True):
# occupancy
if cfg.model.get('wdet3d', True):
occ_preds = model.occ_head.get_occ(trt_output['output_6']) # List[(Dx, Dy, Dz), (Dx, Dy, Dz), ...]
else:
occ_preds = model.occ_head.get_occ(trt_output['output_0']) # List[(Dx, Dy, Dz), (Dx, Dy, Dz), ...]
if args.eval:
if cfg.model.get('wdet3d', True) and (not cfg.model.get('wocc', True)):
results.append(bbox_results[0])
elif cfg.model.get('wdet3d', True) and cfg.model.get('wocc', True):
results.append({'pts_bbox': bbox_results[0], 'pred_occ': occ_preds[0]})
elif (not cfg.model.get('wdet3d', False)) and cfg.model.get('wocc', True):
results.append(occ_preds[0])
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % 50 == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.2f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall \nfps: {fps:.2f} img / s '
f'\ninference time: {1000/fps:.2f} ms')
if not args.eval:
return
assert args.eval
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval))
print(dataset.evaluate(results, **eval_kwargs))
if __name__ == '__main__':
    main()
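# Example invocation (the TensorRT engine is exported beforehand; script, config and
# engine paths are placeholders):
#   python tools/analysis_tools/benchmark_trt.py configs/flashocc/flashocc-r50.py work_dirs/flashocc/bevdet.engine --eval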
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import numpy as np
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--samples', type=int, default=1000, help='samples to benchmark')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--mem-only',
action='store_true',
help='Conduct the memory analysis only')
parser.add_argument(
'--no-acceleration',
action='store_true',
help='Omit the pre-computation acceleration')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=False,
shuffle=False)
# build the model and load checkpoint
if not args.no_acceleration:
cfg.model.img_view_transformer.accelerate=True
cfg.model.train_cfg = None
assert cfg.model.type == 'BEVDet', \
'Please use class BEVDet for ' \
'view transformation inference ' \
'speed estimation instead of %s'% cfg.model.type
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
load_checkpoint(model, args.checkpoint, map_location='cpu')
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 100
pure_inf_time = 0
D = model.module.img_view_transformer.D
out_channels = model.module.img_view_transformer.out_channels
depth_net = model.module.img_view_transformer.depth_net
view_transformer = model.module.img_view_transformer
# benchmark with several samples and take the average
for i, data in enumerate(data_loader):
with torch.no_grad():
img_feat, _ = \
model.module.image_encoder(data['img_inputs'][0][0].cuda())
B, N, C, H, W = img_feat.shape
x = depth_net(img_feat.reshape(B * N, C, H, W))
depth_digit = x[:, :D, ...]
tran_feat = x[:, D:D + out_channels, ...]
depth = depth_digit.softmax(dim=1)
input = [img_feat] + [d.cuda() for d in data['img_inputs'][0][1:]]
if i == 0:
precomputed_memory_allocated = 0.0
if view_transformer.accelerate:
start_mem_allocated = torch.cuda.memory_allocated()
view_transformer.pre_compute(input)
end_mem_allocated = torch.cuda.memory_allocated()
precomputed_memory_allocated = \
end_mem_allocated - start_mem_allocated
ref_max_mem_allocated = torch.cuda.max_memory_allocated()
# occupy the memory
size = (ref_max_mem_allocated - end_mem_allocated) // 4
occupy_tensor = torch.zeros(
size=(size, ), device='cuda', dtype=torch.float32)
print('Memory analysis: \n'
'precomputed_memory_allocated : %d B / %.01f MB \n' %
(precomputed_memory_allocated,
precomputed_memory_allocated / 1024 / 1024))
start_mem_allocated = torch.cuda.memory_allocated()
bev_feat = view_transformer.view_transform_core(
input, depth, tran_feat)[0]
end_max_mem_allocated = torch.cuda.max_memory_allocated()
peak_memory_allocated = \
end_max_mem_allocated - start_mem_allocated
total_memory_requirement = \
precomputed_memory_allocated + peak_memory_allocated
print('Memory analysis: \n'
'Memory requirement : %d B / %.01f MB \n' %
(total_memory_requirement,
total_memory_requirement / 1024 / 1024))
if args.mem_only:
return
torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():
view_transformer.view_transform(input, depth, tran_feat)[0]
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Done image [{i + 1:<3}/ {args.samples}], '
f'fps: {fps:.1f} img / s')
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f'Overall fps: {fps:.1f} img / s')
return fps
if __name__ == '__main__':
repeat_times = 1
fps_list = []
for _ in range(repeat_times):
fps = main()
time.sleep(5)
fps_list.append(fps)
fps_list = np.array(fps_list, dtype=np.float32)
print(f'Mean Overall fps: {fps_list.mean():.4f} +'
f' {np.sqrt(fps_list.var()):.4f} img / s')
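# Example invocation (a BEVDet-style config is required by the assert above; script,
# config and checkpoint paths are placeholders):
#   python tools/analysis_tools/benchmark_view_transformer.py configs/bevdet/bevdet-r50.py work_dirs/bevdet/epoch_24.pth --mem-only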
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import sys
sys.path.insert(0, os.getcwd())
import torch
from mmcv import Config, DictAction
from mmdet3d.models import build_model
try:
from mmcv.cnn import get_model_complexity_info
except ImportError:
raise ImportError('Please upgrade mmcv to >0.6.2')
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument(
'--shape',
type=int,
nargs='+',
default=[40000, 4],
help='input point cloud size')
parser.add_argument(
'--modality',
type=str,
default='point',
choices=['point', 'image', 'multi'],
help='input data modality')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def construct_input(input_shape):
rot = torch.eye(4).float().cuda().view(1, 1, 4, 4).expand(1,6,4,4)
intrins = torch.eye(3).float().cuda().view(1,1, 3, 3).expand(1,6,3,3)
input = dict(img_inputs=[
torch.ones(()).new_empty((1, 6, *input_shape)).cuda(), rot,
rot, intrins, intrins,
torch.ones((1, 6, 3)).cuda(),
torch.eye(3).float().cuda().view(1, 3, 3)
])
return input
def main():
args = parse_args()
if args.modality == 'point':
assert len(args.shape) == 2, 'invalid input shape'
input_shape = tuple(args.shape)
elif args.modality == 'image':
if len(args.shape) == 1:
input_shape = (3, args.shape[0], args.shape[0])
elif len(args.shape) == 2:
input_shape = (3, ) + tuple(args.shape)
else:
raise ValueError('invalid input shape')
elif args.modality == 'multi':
raise NotImplementedError(
'FLOPs counter is currently not supported for models with '
'multi-modality input')
cfg = Config.fromfile(args.config)
# if 'stereo' in args.config or 'longterm' in args.config:
# assert False,'Config has not supported: %s ' % args.config
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
model = build_model(
cfg.model,
train_cfg=cfg.get('train_cfg'),
test_cfg=cfg.get('test_cfg'))
if torch.cuda.is_available():
model.cuda()
model.eval()
if hasattr(model, 'forward_dummy'):
model.forward = model.forward_dummy
else:
raise NotImplementedError(
'FLOPs counter is currently not supported for {}'.format(
model.__class__.__name__))
flops, params = get_model_complexity_info(
model, input_shape, input_constructor=construct_input)
split_line = '=' * 30
print(f'{split_line}\nInput shape: {input_shape}\n'
f'Flops: {flops}\nParams: {params}\n{split_line}')
print('!!!Please be cautious if you use the results in papers. '
'You may need to check if all ops are supported and verify that the '
'flops computation is correct.')
if __name__ == '__main__':
main()
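# Example invocation (image modality with a 256x704 input; script and config paths
# are placeholders):
#   python tools/analysis_tools/get_flops.py configs/flashocc/flashocc-r50.py --modality image --shape 256 704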
# Copyright (c) Phigent Robotics. All rights reserved.
import argparse
import json
import os
import pickle
import cv2
import numpy as np
from pyquaternion.quaternion import Quaternion
from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes as LB
def check_point_in_img(points, height, width):
valid = np.logical_and(points[:, 0] >= 0, points[:, 1] >= 0)
valid = np.logical_and(
valid, np.logical_and(points[:, 0] < width, points[:, 1] < height))
return valid
def depth2color(depth):
gray = max(0, min((depth + 2.5) / 3.0, 1.0))
max_lumi = 200
colors = np.array(
[[max_lumi, 0, max_lumi], [max_lumi, 0, 0], [max_lumi, max_lumi, 0],
[0, max_lumi, 0], [0, max_lumi, max_lumi], [0, 0, max_lumi]],
dtype=np.float32)
if gray == 1:
return tuple(colors[-1].tolist())
num_rank = len(colors) - 1
    rank = np.floor(gray * num_rank).astype(int)
diff = (gray - rank / num_rank) * num_rank
return tuple(
(colors[rank] + (colors[rank + 1] - colors[rank]) * diff).tolist())
def lidar2img(points_lidar, camera_info):
    # Project lidar-frame points into the image plane of a single camera.
    points_lidar_homogeneous = \
        np.concatenate([points_lidar,
                        np.ones((points_lidar.shape[0], 1),
                                dtype=points_lidar.dtype)], axis=1)
    camera2lidar = np.eye(4, dtype=np.float32)
    camera2lidar[:3, :3] = camera_info['sensor2lidar_rotation']
    camera2lidar[:3, 3] = camera_info['sensor2lidar_translation']
    lidar2camera = np.linalg.inv(camera2lidar)
    points_camera_homogeneous = points_lidar_homogeneous @ lidar2camera.T
    points_camera = points_camera_homogeneous[:, :3]
    # keep only points sufficiently in front of the camera (depth > 0.5 m)
    valid = np.ones((points_camera.shape[0]), dtype=bool)
    valid = np.logical_and(points_camera[:, -1] > 0.5, valid)
    points_camera = points_camera / points_camera[:, 2:3]
    camera2img = camera_info['cam_intrinsic']
    points_img = points_camera @ camera2img.T
    points_img = points_img[:, :2]
    return points_img, valid
def get_lidar2global(infos):
lidar2ego = np.eye(4, dtype=np.float32)
lidar2ego[:3, :3] = Quaternion(infos['lidar2ego_rotation']).rotation_matrix
lidar2ego[:3, 3] = infos['lidar2ego_translation']
ego2global = np.eye(4, dtype=np.float32)
ego2global[:3, :3] = Quaternion(
infos['ego2global_rotation']).rotation_matrix
ego2global[:3, 3] = infos['ego2global_translation']
return ego2global @ lidar2ego
def parse_args():
parser = argparse.ArgumentParser(description='Visualize the predicted '
'result of nuScenes')
parser.add_argument(
'res', help='Path to the predicted result in json format')
parser.add_argument(
'--show-range',
type=int,
default=50,
help='Range of visualization in BEV')
parser.add_argument(
        '--canva-size', type=int, default=1000, help='Size of the BEV canvas in pixels')
parser.add_argument(
'--vis-frames',
type=int,
default=500,
help='Number of frames for visualization')
parser.add_argument(
'--scale-factor',
type=int,
default=4,
        help='Scale factor trading off the size of the image views against '
        'the BEV canvas in the visualized result')
parser.add_argument(
'--vis-thred',
type=float,
default=0.3,
        help='Score threshold for filtering the predicted results')
parser.add_argument('--draw-gt', action='store_true')
parser.add_argument(
'--version',
type=str,
default='val',
help='Version of nuScenes dataset')
parser.add_argument(
'--root_path',
type=str,
default='./data/nuscenes',
help='Path to nuScenes dataset')
parser.add_argument(
'--save_path',
type=str,
default='./vis',
help='Path to save visualization results')
parser.add_argument(
'--format',
type=str,
default='video',
choices=['video', 'image'],
help='The desired format of the visualization result')
parser.add_argument(
'--fps', type=int, default=20, help='Frame rate of video')
parser.add_argument(
'--video-prefix', type=str, default='vis', help='name of video')
args = parser.parse_args()
return args
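# Example invocation (hypothetical script path and result file, shown for illustration only):
#   python tools/analysis_tools/vis.py work_dirs/bevdet/results_nusc.json \
#       --version val --format video --fps 20 --draw-gt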
color_map = {0: (255, 255, 0), 1: (0, 255, 255)}
def main():
args = parse_args()
# load predicted results
res = json.load(open(args.res, 'r'))
# load dataset information
info_path = \
args.root_path + '/bevdetv2-nuscenes_infos_%s.pkl' % args.version
dataset = pickle.load(open(info_path, 'rb'))
# prepare save path and medium
vis_dir = args.save_path
if not os.path.exists(vis_dir):
os.makedirs(vis_dir)
print('saving visualized result to %s' % vis_dir)
scale_factor = args.scale_factor
canva_size = args.canva_size
show_range = args.show_range
if args.format == 'video':
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
vout = cv2.VideoWriter(
os.path.join(vis_dir, '%s.mp4' % args.video_prefix), fourcc,
args.fps, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
draw_boxes_indexes_bev = [(0, 1), (1, 2), (2, 3), (3, 0)]
draw_boxes_indexes_img_view = [(0, 1), (1, 2), (2, 3), (3, 0), (4, 5),
(5, 6), (6, 7), (7, 4), (0, 4), (1, 5),
(2, 6), (3, 7)]
views = [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
]
print('start visualizing results')
for cnt, infos in enumerate(
dataset['infos'][:min(args.vis_frames, len(dataset['infos']))]):
if cnt % 10 == 0:
print('%d/%d' % (cnt, min(args.vis_frames, len(dataset['infos']))))
# collect instances
pred_res = res['results'][infos['token']]
pred_boxes = [
pred_res[rid]['translation'] + pred_res[rid]['size'] + [
Quaternion(pred_res[rid]['rotation']).yaw_pitch_roll[0] +
np.pi / 2
] for rid in range(len(pred_res))
]
if len(pred_boxes) == 0:
corners_lidar = np.zeros((0, 3), dtype=np.float32)
else:
pred_boxes = np.array(pred_boxes, dtype=np.float32)
boxes = LB(pred_boxes, origin=(0.5, 0.5, 0.0))
corners_global = boxes.corners.numpy().reshape(-1, 3)
corners_global = np.concatenate(
[corners_global,
np.ones([corners_global.shape[0], 1])],
axis=1)
l2g = get_lidar2global(infos)
corners_lidar = corners_global @ np.linalg.inv(l2g).T
corners_lidar = corners_lidar[:, :3]
        pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
scores = [
pred_res[rid]['detection_score'] for rid in range(len(pred_res))
]
if args.draw_gt:
gt_boxes = infos['gt_boxes']
gt_boxes[:, -1] = gt_boxes[:, -1] + np.pi / 2
width = gt_boxes[:, 4].copy()
gt_boxes[:, 4] = gt_boxes[:, 3]
gt_boxes[:, 3] = width
corners_lidar_gt = \
LB(infos['gt_boxes'],
origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
axis=0)
            gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
pred_flag = np.concatenate(
[pred_flag, np.logical_not(gt_flag)], axis=0)
scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
scores = np.array(scores, dtype=np.float32)
sort_ids = np.argsort(scores)
# image view
imgs = []
for view in views:
img = cv2.imread(infos['cams'][view]['data_path'])
# draw instances
corners_img, valid = lidar2img(corners_lidar, infos['cams'][view])
valid = np.logical_and(
valid,
check_point_in_img(corners_img, img.shape[0], img.shape[1]))
valid = valid.reshape(-1, 8)
            corners_img = corners_img.reshape(-1, 8, 2).astype(np.int32)
for aid in range(valid.shape[0]):
for index in draw_boxes_indexes_img_view:
if valid[aid, index[0]] and valid[aid, index[1]]:
cv2.line(
img,
tuple(corners_img[aid, index[0]]),
tuple(corners_img[aid, index[1]]),
color=color_map[int(pred_flag[aid])],
thickness=scale_factor)
imgs.append(img)
# bird-eye-view
canvas = np.zeros((int(canva_size), int(canva_size), 3),
dtype=np.uint8)
# draw lidar points
lidar_points = np.fromfile(infos['lidar_path'], dtype=np.float32)
lidar_points = lidar_points.reshape(-1, 5)[:, :3]
lidar_points[:, 1] = -lidar_points[:, 1]
lidar_points[:, :2] = \
(lidar_points[:, :2] + show_range) / show_range / 2.0 * canva_size
for p in lidar_points:
if check_point_in_img(
p.reshape(1, 3), canvas.shape[1], canvas.shape[0])[0]:
color = depth2color(p[2])
cv2.circle(
canvas, (int(p[0]), int(p[1])),
radius=0,
color=color,
thickness=1)
# draw instances
corners_lidar = corners_lidar.reshape(-1, 8, 3)
corners_lidar[:, :, 1] = -corners_lidar[:, :, 1]
bottom_corners_bev = corners_lidar[:, [0, 3, 7, 4], :2]
bottom_corners_bev = \
(bottom_corners_bev + show_range) / show_range / 2.0 * canva_size
bottom_corners_bev = np.round(bottom_corners_bev).astype(np.int32)
center_bev = corners_lidar[:, [0, 3, 7, 4], :2].mean(axis=1)
head_bev = corners_lidar[:, [0, 4], :2].mean(axis=1)
        center_canvas = \
            (center_bev + show_range) / show_range / 2.0 * canva_size
        center_canvas = center_canvas.astype(np.int32)
head_canvas = (head_bev + show_range) / show_range / 2.0 * canva_size
head_canvas = head_canvas.astype(np.int32)
for rid in sort_ids:
score = scores[rid]
if score < args.vis_thred and pred_flag[rid]:
continue
score = min(score * 2.0, 1.0) if pred_flag[rid] else 1.0
color = color_map[int(pred_flag[rid])]
for index in draw_boxes_indexes_bev:
cv2.line(
canvas,
bottom_corners_bev[rid, index[0]],
bottom_corners_bev[rid, index[1]],
[color[0] * score, color[1] * score, color[2] * score],
thickness=1)
cv2.line(
canvas,
center_canvas[rid],
head_canvas[rid],
[color[0] * score, color[1] * score, color[2] * score],
1,
lineType=8)
# fuse image-view and bev
img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3),
dtype=np.uint8)
img[:900, :, :] = np.concatenate(imgs[:3], axis=1)
img_back = np.concatenate(
[imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]],
axis=1)
img[900 + canva_size * scale_factor:, :, :] = img_back
img = cv2.resize(img, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
w_begin = int((1600 * 3 / scale_factor - canva_size) // 2)
img[int(900 / scale_factor):int(900 / scale_factor) + canva_size,
w_begin:w_begin + canva_size, :] = canvas
if args.format == 'image':
cv2.imwrite(os.path.join(vis_dir, '%s.jpg' % infos['token']), img)
elif args.format == 'video':
vout.write(img)
if args.format == 'video':
vout.release()
if __name__ == '__main__':
main()
import os
import mmcv
import open3d as o3d
import numpy as np
import torch
import pickle
import math
from typing import Tuple, List, Dict, Iterable
import argparse
import cv2
NOT_OBSERVED = -1
FREE = 0
OCCUPIED = 1
FREE_LABEL = 17
BINARY_OBSERVED = 1
BINARY_NOT_OBSERVED = 0
VOXEL_SIZE = [0.4, 0.4, 0.4]
POINT_CLOUD_RANGE = [-40, -40, -1, 40, 40, 5.4]
SPATIAL_SHAPE = [200, 200, 16]
TGT_VOXEL_SIZE = [0.4, 0.4, 0.4]
TGT_POINT_CLOUD_RANGE = [-40, -40, -1, 40, 40, 5.4]
colormap_to_colors = np.array(
    [
        [0, 0, 0, 255],         # 0  undefined / others
        [112, 128, 144, 255],   # 1  barrier
        [220, 20, 60, 255],     # 2  bicycle
        [255, 127, 80, 255],    # 3  bus
        [255, 158, 0, 255],     # 4  car
        [233, 150, 70, 255],    # 5  construction vehicle
        [255, 61, 99, 255],     # 6  motorcycle
        [0, 0, 230, 255],       # 7  pedestrian
        [47, 79, 79, 255],      # 8  traffic cone
        [255, 140, 0, 255],     # 9  trailer
        [255, 99, 71, 255],     # 10 truck
        [0, 207, 191, 255],     # 11 driveable surface
        [175, 0, 75, 255],      # 12 other flat
        [75, 0, 75, 255],       # 13 sidewalk
        [112, 180, 60, 255],    # 14 terrain
        [222, 184, 135, 255],   # 15 manmade
        [0, 175, 0, 255],       # 16 vegetation
    ], dtype=np.float32)
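# RGBA values in [0, 255]; show_occ() below divides them by 255 before passing them to Open3D.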
def voxel2points(voxel, occ_show, voxelSize):
"""
Args:
voxel: (Dx, Dy, Dz)
occ_show: (Dx, Dy, Dz)
voxelSize: (dx, dy, dz)
Returns:
points: (N, 3) 3: (x, y, z)
voxel: (N, ) cls_id
occIdx: (x_idx, y_idx, z_idx)
"""
occIdx = torch.where(occ_show)
points = torch.cat((occIdx[0][:, None] * voxelSize[0] + POINT_CLOUD_RANGE[0], \
occIdx[1][:, None] * voxelSize[1] + POINT_CLOUD_RANGE[1], \
occIdx[2][:, None] * voxelSize[2] + POINT_CLOUD_RANGE[2]),
dim=1) # (N, 3) 3: (x, y, z)
return points, voxel[occIdx], occIdx
def voxel_profile(voxel, voxel_size):
"""
Args:
voxel: (N, 3) 3:(x, y, z)
voxel_size: (vx, vy, vz)
Returns:
box: (N, 7) (x, y, z - dz/2, vx, vy, vz, 0)
"""
centers = torch.cat((voxel[:, :2], voxel[:, 2][:, None] - voxel_size[2] / 2), dim=1) # (x, y, z - dz/2)
# centers = voxel
wlh = torch.cat((torch.tensor(voxel_size[0]).repeat(centers.shape[0])[:, None],
torch.tensor(voxel_size[1]).repeat(centers.shape[0])[:, None],
torch.tensor(voxel_size[2]).repeat(centers.shape[0])[:, None]), dim=1)
yaw = torch.full_like(centers[:, 0:1], 0)
return torch.cat((centers, wlh, yaw), dim=1)
def rotz(t):
"""Rotation about the z-axis."""
c = torch.cos(t)
s = torch.sin(t)
return torch.tensor([[c, -s, 0],
[s, c, 0],
[0, 0, 1]])
def my_compute_box_3d(center, size, heading_angle):
"""
Args:
center: (N, 3) 3: (x, y, z - dz/2)
size: (N, 3) 3: (vx, vy, vz)
heading_angle: (N, 1)
Returns:
corners_3d: (N, 8, 3)
"""
h, w, l = size[:, 2], size[:, 0], size[:, 1]
center[:, 2] = center[:, 2] + h / 2
l, w, h = (l / 2).unsqueeze(1), (w / 2).unsqueeze(1), (h / 2).unsqueeze(1)
x_corners = torch.cat([-l, l, l, -l, -l, l, l, -l], dim=1)[..., None]
y_corners = torch.cat([w, w, -w, -w, w, w, -w, -w], dim=1)[..., None]
z_corners = torch.cat([h, h, h, h, -h, -h, -h, -h], dim=1)[..., None]
corners_3d = torch.cat([x_corners, y_corners, z_corners], dim=2)
corners_3d[..., 0] += center[:, 0:1]
corners_3d[..., 1] += center[:, 1:2]
corners_3d[..., 2] += center[:, 2:3]
return corners_3d
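# Illustrative sanity check (not part of the pipeline): a single unit voxel whose
# bottom-face centre sits at the origin yields corners spanning [-0.5, 0.5] in x/y
# and [0, 1] in z, because the function shifts the centre up by h/2 in place:
#   my_compute_box_3d(torch.zeros(1, 3), torch.ones(1, 3), torch.zeros(1, 1))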
def show_point_cloud(points: np.ndarray, colors=True, points_colors=None, bbox3d=None, voxelize=False,
bbox_corners=None, linesets=None, vis=None, offset=[0,0,0], large_voxel=True, voxel_size=0.4):
"""
:param points: (N, 3) 3:(x, y, z)
:param colors: false 不显示点云颜色
:param points_colors: (N, 4)
:param bbox3d: voxel grid (N, 7) 7: (center, wlh, yaw=0)
:param voxelize: false 不显示voxel边界
:param bbox_corners: (N, 8, 3) voxel grid 角点坐标, 用于绘制voxel grid 边界.
:param linesets: 用于绘制voxel grid 边界.
:return:
"""
if vis is None:
vis = o3d.visualization.VisualizerWithKeyCallback()
vis.create_window()
if isinstance(offset, list) or isinstance(offset, tuple):
offset = np.array(offset)
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(points+offset)
if colors:
pcd.colors = o3d.utility.Vector3dVector(points_colors[:, :3])
mesh_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(
size=1, origin=[0, 0, 0])
voxelGrid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size=voxel_size)
if large_voxel:
vis.add_geometry(voxelGrid)
else:
vis.add_geometry(pcd)
if voxelize:
line_sets = o3d.geometry.LineSet()
line_sets.points = o3d.open3d.utility.Vector3dVector(bbox_corners.reshape((-1, 3))+offset)
line_sets.lines = o3d.open3d.utility.Vector2iVector(linesets.reshape((-1, 2)))
line_sets.paint_uniform_color((0, 0, 0))
vis.add_geometry(line_sets)
vis.add_geometry(mesh_frame)
# ego_pcd = o3d.geometry.PointCloud()
# ego_points = generate_the_ego_car()
# ego_pcd.points = o3d.utility.Vector3dVector(ego_points)
# vis.add_geometry(ego_pcd)
return vis
def show_occ(occ_state, occ_show, voxel_size, vis=None, offset=[0, 0, 0]):
"""
Args:
occ_state: (Dx, Dy, Dz), cls_id
occ_show: (Dx, Dy, Dz), bool
voxel_size: [0.4, 0.4, 0.4]
vis: Visualizer
offset:
Returns:
"""
colors = colormap_to_colors / 255
pcd, labels, occIdx = voxel2points(occ_state, occ_show, voxel_size)
# pcd: (N, 3) 3: (x, y, z)
# labels: (N, ) cls_id
_labels = labels % len(colors)
pcds_colors = colors[_labels] # (N, 4)
bboxes = voxel_profile(pcd, voxel_size) # (N, 7) 7: (x, y, z - dz/2, dx, dy, dz, 0)
bboxes_corners = my_compute_box_3d(bboxes[:, 0:3], bboxes[:, 3:6], bboxes[:, 6:7]) # (N, 8, 3)
bases_ = torch.arange(0, bboxes_corners.shape[0] * 8, 8)
    edges = torch.tensor([[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7], [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]])  # the 12 edges of each box
edges = edges.reshape((1, 12, 2)).repeat(bboxes_corners.shape[0], 1, 1) # (N, 12, 2)
    # (N, 12, 2) + (N, 1, 1) --> (N, 12, 2): offset the edge indices so they index into the flattened corner array, i.e. range [0, N*8)
edges = edges + bases_[:, None, None]
vis = show_point_cloud(
points=pcd.numpy(),
colors=True,
points_colors=pcds_colors,
voxelize=True,
bbox3d=bboxes.numpy(),
bbox_corners=bboxes_corners.numpy(),
linesets=edges.numpy(),
vis=vis,
offset=offset,
large_voxel=True,
voxel_size=0.4
)
return vis
def generate_the_ego_car():
ego_range = [-2, -1, 0, 2, 1, 1.5]
ego_voxel_size=[0.1, 0.1, 0.1]
ego_xdim = int((ego_range[3] - ego_range[0]) / ego_voxel_size[0])
ego_ydim = int((ego_range[4] - ego_range[1]) / ego_voxel_size[1])
ego_zdim = int((ego_range[5] - ego_range[2]) / ego_voxel_size[2])
temp_x = np.arange(ego_xdim)
temp_y = np.arange(ego_ydim)
temp_z = np.arange(ego_zdim)
ego_xyz = np.stack(np.meshgrid(temp_y, temp_x, temp_z), axis=-1).reshape(-1, 3)
ego_point_x = (ego_xyz[:, 0:1] + 0.5) / ego_xdim * (ego_range[3] - ego_range[0]) + ego_range[0]
ego_point_y = (ego_xyz[:, 1:2] + 0.5) / ego_ydim * (ego_range[4] - ego_range[1]) + ego_range[1]
ego_point_z = (ego_xyz[:, 2:3] + 0.5) / ego_zdim * (ego_range[5] - ego_range[2]) + ego_range[2]
ego_point_xyz = np.concatenate((ego_point_y, ego_point_x, ego_point_z), axis=-1)
ego_points_label = (np.ones((ego_point_xyz.shape[0]))*16).astype(np.uint8)
ego_dict = {}
ego_dict['point'] = ego_point_xyz
ego_dict['label'] = ego_points_label
return ego_point_xyz
def parse_args():
parser = argparse.ArgumentParser(description='Visualize the predicted '
'result of nuScenes')
parser.add_argument(
'res', help='Path to the predicted result')
parser.add_argument(
        '--canva-size', type=int, default=1000, help='Size of the BEV canvas in pixels')
parser.add_argument(
'--vis-frames',
type=int,
default=500,
help='Number of frames for visualization')
parser.add_argument(
'--scale-factor',
type=int,
default=4,
        help='Scale factor trading off the size of the image views against '
        'the BEV canvas in the visualized result')
parser.add_argument(
'--version',
type=str,
default='val',
help='Version of nuScenes dataset')
parser.add_argument('--draw-gt', action='store_true')
parser.add_argument(
'--root_path',
type=str,
default='./data/nuscenes',
help='Path to nuScenes dataset')
parser.add_argument(
'--save_path',
type=str,
default='./vis',
help='Path to save visualization results')
parser.add_argument(
'--format',
type=str,
default='image',
choices=['video', 'image'],
help='The desired format of the visualization result')
parser.add_argument(
'--fps', type=int, default=10, help='Frame rate of video')
parser.add_argument(
'--video-prefix', type=str, default='vis', help='name of video')
args = parser.parse_args()
return args
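# Example invocation (hypothetical script path and results directory, shown for illustration only):
#   python tools/analysis_tools/vis_occ.py work_dirs/flashocc/occ_results \
#       --version val --format image --draw-gt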
def main():
args = parse_args()
# load predicted results
results_dir = args.res
# load dataset information
info_path = \
args.root_path + '/bevdetv2-nuscenes_infos_%s.pkl' % args.version
dataset = pickle.load(open(info_path, 'rb'))
# prepare save path and medium
vis_dir = args.save_path
if not os.path.exists(vis_dir):
os.makedirs(vis_dir)
print('saving visualized result to %s' % vis_dir)
scale_factor = args.scale_factor
canva_size = args.canva_size
if args.format == 'video':
fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
vout = cv2.VideoWriter(
os.path.join(vis_dir, '%s.mp4' % args.video_prefix), fourcc,
args.fps, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
views = [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
]
print('start visualizing results')
vis = o3d.visualization.VisualizerWithKeyCallback()
vis.create_window()
for cnt, info in enumerate(
dataset['infos'][:min(args.vis_frames, len(dataset['infos']))]):
if cnt % 10 == 0:
print('%d/%d' % (cnt, min(args.vis_frames, len(dataset['infos']))))
scene_name = info['scene_name']
sample_token = info['token']
pred_occ_path = os.path.join(results_dir, scene_name, sample_token, 'pred.npz')
gt_occ_path = info['occ_path']
pred_occ = np.load(pred_occ_path)['pred']
gt_data = np.load(os.path.join(args.root_path, gt_occ_path, 'labels.npz'))
voxel_label = gt_data['semantics']
lidar_mask = gt_data['mask_lidar']
camera_mask = gt_data['mask_camera']
# load imgs
imgs = []
for view in views:
img = cv2.imread(info['cams'][view]['data_path'])
imgs.append(img)
# occ_canvas
voxel_show = np.logical_and(pred_occ != FREE_LABEL, camera_mask)
# voxel_show = pred_occ != FREE_LABEL
voxel_size = VOXEL_SIZE
vis = show_occ(torch.from_numpy(pred_occ), torch.from_numpy(voxel_show), voxel_size=voxel_size, vis=vis,
offset=[0, pred_occ.shape[0] * voxel_size[0] * 1.2 * 0, 0])
if args.draw_gt:
voxel_show = np.logical_and(voxel_label != FREE_LABEL, camera_mask)
vis = show_occ(torch.from_numpy(voxel_label), torch.from_numpy(voxel_show), voxel_size=voxel_size, vis=vis,
offset=[0, voxel_label.shape[0] * voxel_size[0] * 1.2 * 1, 0])
view_control = vis.get_view_control()
look_at = np.array([-0.185, 0.513, 3.485])
front = np.array([-0.974, -0.055, 0.221])
up = np.array([0.221, 0.014, 0.975])
zoom = np.array([0.08])
view_control.set_lookat(look_at)
view_control.set_front(front)
view_control.set_up(up)
view_control.set_zoom(zoom)
opt = vis.get_render_option()
opt.background_color = np.asarray([1, 1, 1])
opt.line_width = 5
vis.poll_events()
vis.update_renderer()
vis.run()
# if args.format == 'image':
# out_dir = os.path.join(vis_dir, f'{scene_name}', f'{sample_token}')
# mmcv.mkdir_or_exist(out_dir)
# vis.capture_screen_image(os.path.join(out_dir, 'screen_occ.png'), do_render=True)
occ_canvas = vis.capture_screen_float_buffer(do_render=True)
occ_canvas = np.asarray(occ_canvas)
occ_canvas = (occ_canvas * 255).astype(np.uint8)
occ_canvas = occ_canvas[..., [2, 1, 0]]
occ_canvas_resize = cv2.resize(occ_canvas, (canva_size, canva_size), interpolation=cv2.INTER_CUBIC)
vis.clear_geometries()
big_img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3),
dtype=np.uint8)
big_img[:900, :, :] = np.concatenate(imgs[:3], axis=1)
img_back = np.concatenate(
[imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]],
axis=1)
big_img[900 + canva_size * scale_factor:, :, :] = img_back
big_img = cv2.resize(big_img, (int(1600 / scale_factor * 3),
int(900 / scale_factor * 2 + canva_size)))
w_begin = int((1600 * 3 / scale_factor - canva_size) // 2)
big_img[int(900 / scale_factor):int(900 / scale_factor) + canva_size,
w_begin:w_begin + canva_size, :] = occ_canvas_resize
if args.format == 'image':
out_dir = os.path.join(vis_dir, f'{scene_name}', f'{sample_token}')
mmcv.mkdir_or_exist(out_dir)
for i, img in enumerate(imgs):
cv2.imwrite(os.path.join(out_dir, f'img{i}.png'), img)
cv2.imwrite(os.path.join(out_dir, 'occ.png'), occ_canvas)
cv2.imwrite(os.path.join(out_dir, 'overall.png'), big_img)
elif args.format == 'video':
            cv2.putText(big_img, f'{cnt}', (5, 15), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
                        fontScale=0.5)
cv2.putText(big_img, f'{scene_name}', (5, 35), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
fontScale=0.5)
cv2.putText(big_img, f'{sample_token[:5]}', (5, 55), fontFace=cv2.FONT_HERSHEY_COMPLEX, color=(0, 0, 0),
fontScale=0.5)
vout.write(big_img)
if args.format == 'video':
vout.release()
vis.destroy_window()
if __name__ == '__main__':
main()
import argparse
import sys
import os
sys.path.insert(0, os.getcwd())
import torch.onnx
from mmcv import Config
from mmdeploy.backend.tensorrt.utils import save, search_cuda_version
try:
# If mmdet version > 2.23.0, compat_cfg would be imported and
# used from mmdet instead of mmdet3d.
from mmdet.utils import compat_cfg
except ImportError:
from mmdet3d.utils import compat_cfg
import os
from typing import Dict, Optional, Sequence, Union
import h5py
import mmcv
import numpy as np
import onnx
import pycuda.driver as cuda
import tensorrt as trt
import torch
import tqdm
from mmcv.runner import load_checkpoint
from mmdeploy.apis.core import no_mp
from mmdeploy.backend.tensorrt.calib_utils import HDF5Calibrator
from mmdeploy.backend.tensorrt.init_plugins import load_tensorrt_plugin
from mmdeploy.utils import load_config
from packaging import version
from torch.utils.data import DataLoader
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model
from mmdet.datasets import replace_ImageToTensor
from tools.misc.fuse_conv_bn import fuse_module
class HDF5CalibratorBEVDet(HDF5Calibrator):
def get_batch(self, names: Sequence[str], **kwargs) -> list:
"""Get batch data."""
if self.count < self.dataset_length:
if self.count % 100 == 0:
print('%d/%d' % (self.count, self.dataset_length))
ret = []
for name in names:
input_group = self.calib_data[name]
if name == 'img':
data_np = input_group[str(self.count)][...].astype(
np.float32)
else:
data_np = input_group[str(self.count)][...].astype(
np.int32)
                    # tile the tensor so we can keep the same distribution
opt_shape = self.input_shapes[name]['opt_shape']
data_shape = data_np.shape
reps = [
int(np.ceil(opt_s / data_s))
for opt_s, data_s in zip(opt_shape, data_shape)
]
data_np = np.tile(data_np, reps)
slice_list = tuple(slice(0, end) for end in opt_shape)
data_np = data_np[slice_list]
data_np_cuda_ptr = cuda.mem_alloc(data_np.nbytes)
cuda.memcpy_htod(data_np_cuda_ptr,
np.ascontiguousarray(data_np))
self.buffers[name] = data_np_cuda_ptr
ret.append(self.buffers[name])
self.count += 1
return ret
else:
return None
def parse_args():
parser = argparse.ArgumentParser(description='Deploy BEVDet with Tensorrt')
parser.add_argument('config', help='deploy config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('work_dir', help='work dir to save file')
parser.add_argument(
'--prefix', default='bevdet', help='prefix of the save file name')
parser.add_argument(
'--fp16', action='store_true', help='Whether to use tensorrt fp16')
parser.add_argument(
'--int8', action='store_true', help='Whether to use tensorrt int8')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn; this will slightly increase '
        'the inference speed')
parser.add_argument('--calib_num', type=int, help='num to calib')
args = parser.parse_args()
return args
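# Example invocation (hypothetical config/checkpoint paths, shown for illustration only):
#   python tools/convert_to_trt.py configs/flashocc/flashocc-r50.py \
#       ckpts/flashocc-r50.pth work_dirs/trt/ --fp16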
def get_plugin_names():
return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list]
def create_calib_input_data_impl(calib_file: str,
dataloader: DataLoader,
model_partition: bool = False,
metas: list = [],
calib_num = None) -> None:
with h5py.File(calib_file, mode='w') as file:
calib_data_group = file.create_group('calib_data')
assert not model_partition
# create end2end group
input_data_group = calib_data_group.create_group('end2end')
input_group_img = input_data_group.create_group('img')
input_keys = [
'ranks_bev', 'ranks_depth', 'ranks_feat', 'interval_starts',
'interval_lengths'
]
input_groups = []
for input_key in input_keys:
input_groups.append(input_data_group.create_group(input_key))
metas = [
metas[i].int().detach().cpu().numpy() for i in range(len(metas))
]
for data_id, input_data in enumerate(tqdm.tqdm(dataloader)):
# save end2end data
if (calib_num is not None) and (data_id > calib_num):
break
input_tensor = input_data['img_inputs'][0][0]
input_ndarray = input_tensor.squeeze(0).detach().cpu().numpy()
# print(input_ndarray.shape, input_ndarray.dtype)
input_group_img.create_dataset(
str(data_id),
shape=input_ndarray.shape,
compression='gzip',
compression_opts=4,
data=input_ndarray)
for kid, input_key in enumerate(input_keys):
input_groups[kid].create_dataset(
str(data_id),
shape=metas[kid].shape,
compression='gzip',
compression_opts=4,
data=metas[kid])
file.flush()
def create_calib_input_data(calib_file: str,
deploy_cfg: Union[str, mmcv.Config],
model_cfg: Union[str, mmcv.Config],
model_checkpoint: Optional[str] = None,
dataset_cfg: Optional[Union[str,
mmcv.Config]] = None,
dataset_type: str = 'val',
device: str = 'cpu',
metas: list = [None],
calib_num = None) -> None:
"""Create dataset for post-training quantization.
Args:
calib_file (str): The output calibration data file.
deploy_cfg (str | mmcv.Config): Deployment config file or
Config object.
model_cfg (str | mmcv.Config): Model config file or Config object.
model_checkpoint (str): A checkpoint path of PyTorch model,
defaults to `None`.
dataset_cfg (Optional[Union[str, mmcv.Config]], optional): Model
config to provide calibration dataset. If none, use `model_cfg`
as the dataset config. Defaults to None.
dataset_type (str, optional): The dataset type. Defaults to 'val'.
device (str, optional): Device to create dataset. Defaults to 'cpu'.
"""
with no_mp():
if dataset_cfg is None:
dataset_cfg = model_cfg
# load cfg if necessary
deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
if dataset_cfg is None:
dataset_cfg = model_cfg
# load dataset_cfg if necessary
dataset_cfg = load_config(dataset_cfg)[0]
from mmdeploy.apis.utils import build_task_processor
task_processor = build_task_processor(model_cfg, deploy_cfg, device)
dataset = task_processor.build_dataset(dataset_cfg, dataset_type)
dataloader = task_processor.build_dataloader(
dataset, 1, 1, dist=False, shuffle=False)
create_calib_input_data_impl(
calib_file, dataloader, model_partition=False, metas=metas, calib_num=calib_num)
def from_onnx(onnx_model: Union[str, onnx.ModelProto],
output_file_prefix: str,
input_shapes: Dict[str, Sequence[int]],
max_workspace_size: int = 0,
fp16_mode: bool = False,
int8_mode: bool = False,
int8_param: Optional[dict] = None,
device_id: int = 0,
log_level: trt.Logger.Severity = trt.Logger.ERROR,
**kwargs) -> trt.ICudaEngine:
"""Create a tensorrt engine from ONNX.
Modified from mmdeploy.backend.tensorrt.utils.from_onnx
"""
import os
old_cuda_device = os.environ.get('CUDA_DEVICE', None)
os.environ['CUDA_DEVICE'] = str(device_id)
import pycuda.autoinit # noqa:F401
if old_cuda_device is not None:
os.environ['CUDA_DEVICE'] = old_cuda_device
else:
os.environ.pop('CUDA_DEVICE')
load_tensorrt_plugin()
# create builder and network
logger = trt.Logger(log_level)
builder = trt.Builder(logger)
    EXPLICIT_BATCH = 1 << int(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)
# parse onnx
parser = trt.OnnxParser(network, logger)
if isinstance(onnx_model, str):
onnx_model = onnx.load(onnx_model)
if not parser.parse(onnx_model.SerializeToString()):
error_msgs = ''
for error in range(parser.num_errors):
error_msgs += f'{parser.get_error(error)}\n'
raise RuntimeError(f'Failed to parse onnx, {error_msgs}')
# config builder
if version.parse(trt.__version__) < version.parse('8'):
builder.max_workspace_size = max_workspace_size
config = builder.create_builder_config()
config.max_workspace_size = max_workspace_size
cuda_version = search_cuda_version()
if cuda_version is not None:
version_major = int(cuda_version.split('.')[0])
if version_major < 11:
            # cuBLASLt is only available since CUDA 11, so drop the CUBLAS_LT tactic source for older CUDA versions  # noqa E501
tactic_source = config.get_tactic_sources() - (
1 << int(trt.TacticSource.CUBLAS_LT))
config.set_tactic_sources(tactic_source)
profile = builder.create_optimization_profile()
for input_name, param in input_shapes.items():
min_shape = param['min_shape']
opt_shape = param['opt_shape']
max_shape = param['max_shape']
profile.set_shape(input_name, min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)
if fp16_mode:
if version.parse(trt.__version__) < version.parse('8'):
builder.fp16_mode = fp16_mode
config.set_flag(trt.BuilderFlag.FP16)
if int8_mode:
config.set_flag(trt.BuilderFlag.INT8)
assert int8_param is not None
config.int8_calibrator = HDF5CalibratorBEVDet(
int8_param['calib_file'],
input_shapes,
model_type=int8_param['model_type'],
device_id=device_id,
algorithm=int8_param.get(
'algorithm', trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2))
if version.parse(trt.__version__) < version.parse('8'):
builder.int8_mode = int8_mode
builder.int8_calibrator = config.int8_calibrator
# create engine
engine = builder.build_engine(network, config)
assert engine is not None, 'Failed to create TensorRT engine'
save(engine, output_file_prefix + '.engine')
print('Save engine at ', output_file_prefix + '.engine')
return engine
def main():
args = parse_args()
max_workspace_size = 200*200*256*(2**8)
if not os.path.exists(args.work_dir):
os.makedirs(args.work_dir)
load_tensorrt_plugin()
assert 'bev_pool_v2' in get_plugin_names(), \
'bev_pool_v2 is not in the plugin list of tensorrt, ' \
'please install mmdeploy from ' \
'https://github.com/HuangJunJie2017/mmdeploy.git'
# if args.int8:
# assert args.fp16
model_prefix = args.prefix
if args.int8:
model_prefix = model_prefix + '_int8'
elif args.fp16:
model_prefix = model_prefix + '_fp16'
cfg = Config.fromfile(args.config)
cfg.model.pretrained = None
cfg.model.type = cfg.model.type + 'TRT'
cfg = compat_cfg(cfg)
cfg.gpu_ids = [0]
    # import modules from plugin/xx; the registry will be updated accordingly
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
plg_lib = importlib.import_module(_module_path)
# build the dataloader
test_dataloader_default_args = dict(
samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False)
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
test_loader_cfg = {
**test_dataloader_default_args,
**cfg.data.get('test_dataloader', {})
}
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, **test_loader_cfg)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
# assert model.img_view_transformer.grid_size[0] == 128
# assert model.img_view_transformer.grid_size[1] == 128
# assert model.img_view_transformer.grid_size[2] == 1
if os.path.exists(args.checkpoint):
load_checkpoint(model, args.checkpoint, map_location='cpu')
else:
print(args.checkpoint, " does not exists!")
if args.fuse_conv_bn:
model_prefix = model_prefix + '_fuse'
model = fuse_module(model)
model.cuda()
model.eval()
for i, data in enumerate(data_loader):
inputs = [t.cuda() for t in data['img_inputs'][0]]
img = inputs[0].squeeze(0)
if img.shape[0] > 6:
img = img[:6]
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT']:
metas = model.get_bev_pool_input(inputs, img_metas=data['img_metas'])
else:
if model.__class__.__name__ in ['BEVDetOCCTRT']:
metas = model.get_bev_pool_input(inputs)
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
metas, mlp_input = model.get_bev_pool_input(inputs)
if model.__class__.__name__ in ['FBOCCTRT', 'FBOCC2DTRT', 'BEVDetOCCTRT']:
onnx_input = (img.float().contiguous(), metas[1].int().contiguous(),
metas[2].int().contiguous(), metas[0].int().contiguous(),
metas[3].int().contiguous(), metas[4].int().contiguous())
dynamic_axes={
"ranks_depth" : {0: 'M'},
"ranks_feat" : {0: 'M'},
"ranks_bev" : {0: 'M'},
"interval_starts" : {0: 'N'},
"interval_lengths" : {0: 'N'},
}
input_names=[
'img', 'ranks_depth', 'ranks_feat', 'ranks_bev',
'interval_starts', 'interval_lengths'
]
elif model.__class__.__name__ in ['BEVDepthOCCTRT']:
onnx_input = (img.float().contiguous(), metas[1].int().contiguous(),
metas[2].int().contiguous(), metas[0].int().contiguous(),
metas[3].int().contiguous(), metas[4].int().contiguous(), mlp_input)
dynamic_axes={
"ranks_depth" : {0: 'M'},
"ranks_feat" : {0: 'M'},
"ranks_bev" : {0: 'M'},
"interval_starts" : {0: 'N'},
"interval_lengths" : {0: 'N'},
# "mlp_input" : {0: 'K'},
}
input_names=[
'img', 'ranks_depth', 'ranks_feat', 'ranks_bev',
'interval_starts', 'interval_lengths', 'mlp_input',
]
with torch.no_grad():
            if model.wdet3d and not model.wocc:
                output_names = [f'output_{j}' for j in range(6 * len(model.pts_bbox_head.task_heads))]
            elif model.wdet3d and model.wocc:
                output_names = [f'output_{j}' for j in range(1 + 6 * len(model.pts_bbox_head.task_heads))]
            elif not model.wdet3d and model.wocc:
                output_names = ['output_0']
            else:
                raise ValueError('At least one of wdet3d and wocc must be set to True!')
model.forward = model.forward_ori
torch.onnx.export(
model,
onnx_input,
args.work_dir + model_prefix + '.onnx',
opset_version=11,
dynamic_axes=dynamic_axes,
input_names=input_names,
output_names=output_names)
print('output_names:', output_names)
print('====== onnx is saved at : ', args.work_dir + model_prefix + '.onnx')
# check onnx model
onnx_model = onnx.load(args.work_dir + model_prefix + '.onnx')
try:
onnx.checker.check_model(onnx_model)
except Exception:
print('ONNX Model Incorrect')
else:
print('ONNX Model Correct')
model.forward = model.forward_with_argmax
            output_names = ['cls_occ_label']
torch.onnx.export(
model,
onnx_input,
args.work_dir + model_prefix + '_with_argmax.onnx',
opset_version=11,
dynamic_axes=dynamic_axes,
input_names=input_names,
output_names=output_names)
print('output_names:', output_names)
print('====== onnx is saved at : ', args.work_dir + model_prefix + '_with_argmax.onnx')
# check onnx model
onnx_model = onnx.load(args.work_dir + model_prefix + '_with_argmax.onnx')
try:
onnx.checker.check_model(onnx_model)
except Exception:
print('ONNX Model Incorrect')
else:
print('ONNX Model Correct')
break
# convert to tensorrt
num_points = metas[0].shape[0]
num_intervals = metas[3].shape[0]
img_shape = img.shape
input_shapes = dict(
img=dict(
min_shape=img_shape, opt_shape=img_shape, max_shape=img_shape),
ranks_depth=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
ranks_feat=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
ranks_bev=dict(
min_shape=[num_points],
opt_shape=[num_points],
max_shape=[num_points]),
interval_starts=dict(
min_shape=[num_intervals],
opt_shape=[num_intervals],
max_shape=[num_intervals]),
interval_lengths=dict(
min_shape=[num_intervals],
opt_shape=[num_intervals],
max_shape=[num_intervals]))
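    # Note: min/opt/max shapes are all pinned to the sizes observed in the single sample
    # processed above, so the TensorRT optimization profile effectively fixes the
    # dynamic axes to that sample's point/interval counts.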
deploy_cfg = dict(
backend_config=dict(
type='tensorrt',
common_config=dict(
fp16_mode=args.fp16,
max_workspace_size=max_workspace_size,
int8_mode=args.int8),
model_inputs=[dict(input_shapes=input_shapes)]),
codebase_config=dict(
type='mmdet3d', task='VoxelDetection', model_type='end2end'))
if args.int8:
calib_filename = 'calib_data.h5'
calib_path = os.path.join(args.work_dir, calib_filename)
create_calib_input_data(
calib_path,
deploy_cfg,
args.config,
args.checkpoint,
dataset_cfg=None,
dataset_type='val',
device='cuda:0',
metas=metas,
calib_num=args.calib_num)
from_onnx(
args.work_dir + model_prefix + '.onnx',
args.work_dir + model_prefix,
fp16_mode=args.fp16,
int8_mode=args.int8,
int8_param=dict(
calib_file=os.path.join(args.work_dir, 'calib_data.h5'),
model_type='end2end'),
max_workspace_size=max_workspace_size,
input_shapes=input_shapes)
# if args.int8:
# os.remove(calib_path)
if __name__ == '__main__':
main()