"tools/imglab/vscode:/vscode.git/clone" did not exist on "13269d7f97cd6bdf78b67805ce92f07da99a331d"
Unverified Commit 4d77b4c8 authored by Jingwei Zhang, committed by GitHub

[Feature] Support BEVFusion in `projects/` (#2236)

* add bevfusion models

* refactor

* build successfully

* update ImageAug3D

* support inference

* update the format of final bboxes

* add new loading func

* align test precision

* polish docstring

* refactor transformer decoder

* polish code

* fix table in readme

* fix table in readme

* fix table in readme

* update pre-commit-config

* minor changes

* revert the changes of file_client_args in LoadAnnotation3D

* remove unnecessary functions in BEVFusion

* fix loading bug

* fix docstring
parent c6a8eb1f
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points, const int num_features,
const int NDim) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
// int coor[NDim];
int* coor = new int[NDim]();
int c;
for (int i = 0; i < num_points; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
// necessary to rm points out of range
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[j] = c;
}
for (int k = 0; k < NDim; ++k) {
if (failed)
coors[i][k] = -1;
else
coors[i][k] = coor[k];
}
}
delete[] coor;
return;
}
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points, const int max_voxels,
const int num_points, const int num_features,
const int NDim) {
// declare a temp coors
at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size,
num_points, num_features, NDim);
int voxelidx, num;
auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
return;
}
} // namespace
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// current version takes about 0.02s~0.03s for one frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
// printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
// grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
coors_range, grid_size, max_points, max_voxels, num_points,
num_features, NDim);
});
return voxel_num;
}
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size, num_points, num_features, NDim);
});
return;
}
} // namespace voxelization
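The mapping both CPU kernels share is c = floor((p - range_min) / voxel_size) per axis, with a point discarded (all coordinates set to -1) as soon as any axis leaves the grid. A minimal NumPy sketch of that dynamic path, for intuition only (the function name and values are illustrative, not part of the extension):

import numpy as np

def dynamic_voxelize_ref(points, voxel_size, coors_range):
    # Reference for dynamic_voxelize_kernel above: map each point to an
    # integer voxel coordinate, or (-1, -1, -1) when out of range.
    voxel_size = np.asarray(voxel_size, dtype=np.float32)
    low = np.asarray(coors_range[:3], dtype=np.float32)
    high = np.asarray(coors_range[3:], dtype=np.float32)
    grid_size = np.round((high - low) / voxel_size).astype(np.int32)
    coors = np.floor((points[:, :3] - low) / voxel_size).astype(np.int32)
    valid = ((coors >= 0) & (coors < grid_size)).all(axis=1)
    coors[~valid] = -1
    return coors

pts = np.array([[0.6, 0.6, 0.6], [2.0, 0.0, 0.0]], dtype=np.float32)
print(dynamic_voxelize_ref(pts, [0.25] * 3, [0, 0, 0, 1, 1, 1]))
# [[ 2  2  2]
#  [-1 -1 -1]]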
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
return;
}
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
return;
}
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
} else {
coors_offset[0] = c_x;
coors_offset[1] = c_y;
coors_offset[2] = c_z;
}
}
}
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
const int num_features,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
int index = thread_idx / num_features;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
int k = thread_idx % num_features;
voxels_offset[k] = points[thread_idx];
}
}
}
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
// if (index >= num_points) return;
int index = thread_idx / NDim;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num == 0 && voxelidx > -1) {
auto coors_offset = voxel_coors + voxelidx * NDim;
int k = thread_idx % NDim;
coors_offset[k] = coor[thread_idx];
}
}
}
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int max_points,
const int max_voxels,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// Find all previous points that have the same coors
// if find the same coor, record it
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
// point to the same coor that first show up
point_to_pointidx[index] = i;
} else if (num >= max_points) {
// out of boundary
return;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
if (num < max_points) {
point_to_voxelidx[index] = num;
}
}
}
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
const int max_points, const int max_voxels, const int num_points) {
// only calculate the coors before this coor[index]
for (int i = 0; i < num_points; ++i) {
// if (coor[i][0] == -1)
// continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
if (voxel_num[0] >= max_voxels) continue;
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] += 1;
}
}
}
}
__global__ void nondisterministic_get_assign_pos(
const int nthreads, const int32_t *coors_map, int32_t *pts_id,
int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
if (coors_idx > -1) {
int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
pts_id[thread_idx] = coors_pts_pos;
if (coors_pts_pos == 0) {
coors_order[coors_idx] = atomicAdd(coors_count, 1);
}
}
}
}
template<typename T>
__global__ void nondisterministic_assign_point_voxel(
const int nthreads, const T *points, const int32_t *coors_map,
const int32_t *pts_id, const int32_t *coors_in,
const int32_t *reduce_count, const int32_t *coors_order,
T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
const int max_points, const int num_features, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
int coors_pts_pos = pts_id[thread_idx];
if (coors_idx > -1) {
int coors_pos = coors_order[coors_idx];
if (coors_pos < max_voxels && coors_pts_pos < max_points) {
auto voxels_offset =
voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
auto points_offset = points + thread_idx * num_features;
for (int k = 0; k < num_features; k++) {
voxels_offset[k] = points_offset[k];
}
if (coors_pts_pos == 0) {
pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
auto coors_offset = coors + coors_pos * NDim;
auto coors_in_offset = coors_in + coors_idx * NDim;
for (int k = 0; k < NDim; k++) {
coors_offset[k] = coors_in_offset[k];
}
}
}
}
}
}
namespace voxelization {
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 2. map point to the idx of the corresponding voxel, find duplicate coor
// create some temporary variables
auto point_to_pointidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto point_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(), max_points,
max_voxels, num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and each voxel's coor index
// moving this logic onto the CUDA device speeds it up by roughly 10x
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto voxel_num = at::zeros(
{
1,
},
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
// use scalar_t so non-float32 point tensors are handled correctly
assign_point_to_voxel<scalar_t, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<scalar_t>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxels.contiguous().data_ptr<scalar_t>(), max_points, num_features,
num_points, NDim);
}));
// cudaDeviceSynchronize();
// AT_CUDA_CHECK(cudaGetLastError());
// 5. copy coors of each voxels
auto coors_output_size = num_points * NDim;
dim3 coors_cp_grid(
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
coors.contiguous().data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;
}
int nondisterministic_hard_voxelize_gpu(
const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
if (num_points == 0)
return 0;
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
at::Tensor coors_map;
at::Tensor coors_count;
at::Tensor coors_order;
at::Tensor reduce_count;
at::Tensor pts_id;
auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
std::tie(temp_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, false);
if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
// the first element of temp_coors is (-1,-1,-1) and should be removed
temp_coors = temp_coors.slice(0, 1);
coors_map = coors_map - 1;
}
int num_coors = temp_coors.size(0);
temp_coors = temp_coors.to(torch::kInt32);
coors_map = coors_map.to(torch::kInt32);
coors_count = coors_map.new_zeros(1);
coors_order = coors_map.new_empty(num_coors);
reduce_count = coors_map.new_zeros(num_coors);
pts_id = coors_map.new_zeros(num_points);
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
nondisterministic_get_assign_pos<<<cp_grid, cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_points,
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
coors_count.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>());
}));
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
nondisterministic_assign_point_voxel<scalar_t>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points, points.contiguous().data_ptr<scalar_t>(),
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
temp_coors.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>(),
voxels.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int32_t>(),
num_points_per_voxel.contiguous().data_ptr<int32_t>(),
max_voxels, max_points,
num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
return max_voxels < num_coors ? max_voxels : num_coors;
}
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// current version takes about 0.04s for one frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
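The nondeterministic GPU path replaces the O(N^2) duplicate search of point_to_voxelidx_kernel with a single at::unique_dim over the coordinate rows, then scatters points with atomics. Roughly the same bookkeeping looks like this in plain PyTorch (a sketch mirroring the masked_fill/unique steps above, not the bound extension):

import torch

def dedup_voxel_coors(temp_coors):
    # Any point with a negative axis is collapsed to (-1, -1, -1),
    # mirroring the masked_fill in nondisterministic_hard_voxelize_gpu.
    invalid = temp_coors.lt(0).any(-1, True)
    coors_clean = temp_coors.masked_fill(invalid, -1)
    unique_coors, coors_map = torch.unique(
        coors_clean, dim=0, sorted=True, return_inverse=True)
    if (unique_coors[0] < 0).any():
        # drop the all-(-1) bucket; invalid points now map to -1
        unique_coors = unique_coors[1:]
        coors_map = coors_map - 1
    return unique_coors, coors_map

coors = torch.tensor([[0, 0, 0], [-1, 3, 3], [0, 0, 0], [1, 2, 3]])
print(dedup_voxel_coors(coors))
# (tensor([[0, 0, 0], [1, 2, 3]]), tensor([ 0, -1,  0,  1]))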
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .voxel_layer import dynamic_voxelize, hard_voxelize
class _Voxelization(Function):
@staticmethod
def forward(ctx,
points,
voxel_size,
coors_range,
max_points=35,
max_voxels=20000,
deterministic=True):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points
and points[:, 3:] contain other information like reflectivity
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
size
coors_range: [6] list/tuple or array, float. indicate voxel
range. format: xyzxyz, minmax
max_points: int. indicate maximum points contained in a voxel. if
max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicate maximum voxels this function creates.
for SECOND, 20000 is a good choice. Users should shuffle points
before calling this function because max_voxels may drop points.
deterministic: bool. Whether to use the deterministic
implementation of hard voxelization. The non-deterministic
version is considerably faster but its output is not
reproducible; this flag only affects hard voxelization.
Defaults to True. For more information on this argument and
the implementation insights, please refer to the following
links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
This is an experimental feature and we would appreciate it if
you could share failing cases with us.
Returns:
voxels: [M, max_points, ndim] float tensor. Only contains points;
returned only when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
if max_points == -1 or max_voxels == -1:
coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
return coors
else:
voxels = points.new_zeros(
size=(max_voxels, max_points, points.size(1)))
coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
num_points_per_voxel = points.new_zeros(
size=(max_voxels, ), dtype=torch.int)
voxel_num = hard_voxelize(
points,
voxels,
coors,
num_points_per_voxel,
voxel_size,
coors_range,
max_points,
max_voxels,
3,
deterministic,
)
# select the valid voxels
voxels_out = voxels[:voxel_num]
coors_out = coors[:voxel_num]
num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
return voxels_out, coors_out, num_points_per_voxel_out
voxelization = _Voxelization.apply
class Voxelization(nn.Module):
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000,
deterministic=True):
"""Voxelization layer.
Args:
voxel_size (list): list [x, y, z] size of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
max_num_points (int): max number of points per voxel
max_voxels (tuple or int): max number of voxels in
(training, testing) time
deterministic (bool): Whether to use the deterministic
implementation of hard voxelization. The non-deterministic
version is considerably faster but its output is not
reproducible; this flag only affects hard voxelization.
Defaults to True. For more information on this argument and
the implementation insights, please refer to the following
links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
This is an experimental feature and we would appreciate it if
you could share failing cases with us.
"""
super(Voxelization, self).__init__()
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.max_num_points = max_num_points
if isinstance(max_voxels, tuple):
self.max_voxels = max_voxels
else:
self.max_voxels = _pair(max_voxels)
self.deterministic = deterministic
point_cloud_range = torch.tensor(
point_cloud_range, dtype=torch.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = torch.round(grid_size).long()
input_feat_shape = grid_size[:2]
self.grid_size = grid_size
# the original grid shape is [x-len, y-len, z-len]; the usual
# [w, h, d] -> [d, h, w] reordering is removed here
self.pcd_shape = [*input_feat_shape, 1] # [::-1]
def forward(self, input):
"""
Args:
input: (N, C) points tensor
"""
if self.training:
max_voxels = self.max_voxels[0]
else:
max_voxels = self.max_voxels[1]
return voxelization(
input,
self.voxel_size,
self.point_cloud_range,
self.max_num_points,
max_voxels,
self.deterministic,
)
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', max_num_points=' + str(self.max_num_points)
tmpstr += ', max_voxels=' + str(self.max_voxels)
tmpstr += ', deterministic=' + str(self.deterministic)
tmpstr += ')'
return tmpstr
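A quick usage sketch for the wrapper above, assuming the compiled voxel_layer extension is importable and a GPU is available (shapes follow the docstring; the ranges below just mirror the nuScenes config later in this diff):

import torch

voxel_layer = Voxelization(
    voxel_size=[0.075, 0.075, 0.2],
    point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
    max_num_points=10,
    max_voxels=(120000, 160000))
points = torch.rand(10000, 5, device='cuda')  # x, y, z + 2 extra features
points[:, :2] = points[:, :2] * 108 - 54      # x, y in [-54, 54)
points[:, 2] = points[:, 2] * 8 - 5           # z in [-5, 3)
voxels, coors, num_points = voxel_layer(points)
# voxels: (M, 10, 5); coors: (M, 3); num_points: (M,)
# note the kernels above write coors in (x, y, z) order, which is what
# the (H, W, D) convention of BEVFusionSparseEncoder below relies on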
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet3d.models.layers import make_sparse_convmodule
from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
from mmdet3d.models.middle_encoders import SparseEncoder
from mmdet3d.registry import MODELS
if IS_SPCONV2_AVAILABLE:
from spconv.pytorch import SparseConvTensor
else:
from mmcv.ops import SparseConvTensor
@MODELS.register_module()
class BEVFusionSparseEncoder(SparseEncoder):
r"""Sparse encoder for BEVFusion. The difference between this
implementation and that of ``SparseEncoder`` is that the shape order of 3D
conv is (H, W, D) in ``BEVFusionSparseEncoder`` rather than (D, H, W) in
``SparseEncoder``. This difference comes from the implementation of
``voxelization``.
Args:
in_channels (int): The number of input channels.
sparse_shape (list[int]): The sparse shape of input tensor.
order (list[str], optional): Order of conv module.
Defaults to ('conv', 'norm', 'act').
norm_cfg (dict, optional): Config of normalization layer. Defaults to
dict(type='BN1d', eps=1e-3, momentum=0.01).
base_channels (int, optional): Out channels for conv_input layer.
Defaults to 16.
output_channels (int, optional): Out channels for conv_out layer.
Defaults to 128.
encoder_channels (tuple[tuple[int]], optional):
Convolutional channels of each encode block.
Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
encoder_paddings (tuple[tuple[int]], optional):
Paddings of each encode block.
Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
block_type (str, optional): Type of the block to use.
Defaults to 'conv_module'.
return_middle_feats (bool): Whether to output middle features.
Defaults to False.
"""
def __init__(self,
in_channels,
sparse_shape,
order=('conv', 'norm', 'act'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
base_channels=16,
output_channels=128,
encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
64)),
encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
1)),
block_type='conv_module',
return_middle_feats=False):
super(SparseEncoder, self).__init__()
assert block_type in ['conv_module', 'basicblock']
self.sparse_shape = sparse_shape
self.in_channels = in_channels
self.order = order
self.base_channels = base_channels
self.output_channels = output_channels
self.encoder_channels = encoder_channels
self.encoder_paddings = encoder_paddings
self.stage_num = len(self.encoder_channels)
self.fp16_enabled = False
self.return_middle_feats = return_middle_feats
# Spconv init all weight on its own
assert isinstance(order, tuple) and len(order) == 3
assert set(order) == {'conv', 'norm', 'act'}
if self.order[0] != 'conv': # pre activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d',
order=('conv', ))
else: # post activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d')
encoder_out_channels = self.make_encoder_layers(
make_sparse_convmodule,
norm_cfg,
self.base_channels,
block_type=block_type)
self.conv_out = make_sparse_convmodule(
encoder_out_channels,
self.output_channels,
kernel_size=(1, 1, 3),
stride=(1, 1, 2),
norm_cfg=norm_cfg,
padding=0,
indice_key='spconv_down2',
conv_type='SparseConv3d')
def forward(self, voxel_features, coors, batch_size):
"""Forward of SparseEncoder.
Args:
voxel_features (torch.Tensor): Voxel features in shape (N, C).
coors (torch.Tensor): Coordinates in shape (N, 4),
the columns in the order of (batch_idx, x_idx, y_idx, z_idx).
batch_size (int): Batch size.
Returns:
torch.Tensor | tuple[torch.Tensor, list]: Return spatial features
include:
- spatial_features (torch.Tensor): Spatial features are out from
the last layer.
- encode_features (List[SparseConvTensor], optional): Middle layer
output features. When self.return_middle_feats is True, the
module returns middle features.
"""
coors = coors.int()
input_sp_tensor = SparseConvTensor(voxel_features, coors,
self.sparse_shape, batch_size)
x = self.conv_input(input_sp_tensor)
encode_features = []
for encoder_layer in self.encoder_layers:
x = encoder_layer(x)
encode_features.append(x)
# for detection head
# [200, 176, 5] -> [200, 176, 2]
out = self.conv_out(encode_features[-1])
spatial_features = out.dense()
N, C, H, W, D = spatial_features.shape
spatial_features = spatial_features.permute(0, 1, 4, 2, 3).contiguous()
spatial_features = spatial_features.view(N, C * D, H, W)
if self.return_middle_feats:
return spatial_features, encode_features
else:
return spatial_features
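The tail of forward flattens the downsampled sparse depth axis into channels. Standalone, the permute/view pair does this (shapes only; the sizes are illustrative, with 180 = 1440 / out_size_factor):

import torch

spatial_features = torch.zeros(2, 128, 180, 180, 2)  # (N, C, H, W, D)
N, C, H, W, D = spatial_features.shape
bev = spatial_features.permute(0, 1, 4, 2, 3).contiguous().view(N, C * D, H, W)
print(bev.shape)  # torch.Size([2, 256, 180, 180]); C * D becomes the BEV channels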
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models import DetrTransformerDecoderLayer
from torch import Tensor, nn
from mmdet3d.registry import MODELS
class PositionEncodingLearned(nn.Module):
"""Absolute pos embedding, learned."""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
xyz = xyz.transpose(1, 2).contiguous()
position_embedding = self.position_embedding_head(xyz)
return position_embedding
@MODELS.register_module()
class TransformerDecoderLayer(DetrTransformerDecoderLayer):
def __init__(self,
pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128),
**kwargs):
super().__init__(**kwargs)
self.self_posembed = PositionEncodingLearned(**pos_encoding_cfg)
self.cross_posembed = PositionEncodingLearned(**pos_encoding_cfg)
def forward(self,
query: Tensor,
key: Tensor = None,
value: Tensor = None,
query_pos: Tensor = None,
key_pos: Tensor = None,
self_attn_mask: Tensor = None,
cross_attn_mask: Tensor = None,
key_padding_mask: Tensor = None,
**kwargs) -> Tensor:
"""
Args:
query (Tensor): The input query, has shape (bs, num_queries, dim).
key (Tensor, optional): The input key, has shape (bs, num_keys,
dim). If `None`, the `query` will be used. Defaults to `None`.
value (Tensor, optional): The input value, has the same shape as
`key`, as in `nn.MultiheadAttention.forward`. If `None`, the
`key` will be used. Defaults to `None`.
query_pos (Tensor, optional): The positional encoding for `query`,
has the same shape as `query`. If not `None`, it will be added
to `query` before forward function. Defaults to `None`.
key_pos (Tensor, optional): The positional encoding for `key`, has
the same shape as `key`. If not `None`, it will be added to
`key` before forward function. If None, and `query_pos` has the
same shape as `key`, then `query_pos` will be used for
`key_pos`. Defaults to None.
self_attn_mask (Tensor, optional): ByteTensor mask, has shape
(num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
Defaults to None.
cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
(num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
Defaults to None.
key_padding_mask (Tensor, optional): The `key_padding_mask` of
`self_attn` input. ByteTensor, has shape (bs, num_value).
Defaults to None.
Returns:
Tensor: forwarded results, has shape (bs, num_queries, dim).
"""
if self.self_posembed is not None and query_pos is not None:
query_pos = self.self_posembed(query_pos).transpose(1, 2)
else:
query_pos = None
if self.cross_posembed is not None and key_pos is not None:
key_pos = self.cross_posembed(key_pos).transpose(1, 2)
else:
key_pos = None
query = query.transpose(1, 2)
key = key.transpose(1, 2)
# Note that the `value` (equal to `query`) is encoded with `query_pos`.
# This is different from the standard DETR Decoder Layer.
query = self.self_attn(
query=query,
key=query,
value=query + query_pos,
query_pos=query_pos,
key_pos=query_pos,
attn_mask=self_attn_mask,
**kwargs)
query = self.norms[0](query)
# Note that the `value` (equal to `key`) is encoded with `key_pos`.
# This is different from the standard DETR Decoder Layer.
query = self.cross_attn(
query=query,
key=key,
value=key + key_pos,
query_pos=query_pos,
key_pos=key_pos,
attn_mask=cross_attn_mask,
key_padding_mask=key_padding_mask,
**kwargs)
query = self.norms[1](query)
query = self.ffn(query)
query = self.norms[2](query)
query = query.transpose(1, 2)
return query
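PositionEncodingLearned maps (bs, N, input_channel) coordinates through two Conv1d layers and returns (bs, num_pos_feats, N), which the decoder layer transposes back before adding to queries and keys. A shapes-only check using the class defined above (sizes are illustrative):

import torch

posembed = PositionEncodingLearned(input_channel=2, num_pos_feats=128)
query_pos = torch.rand(4, 200, 2)     # (bs, num_queries, xy)
emb = posembed(query_pos)             # Conv1d runs over the query axis
print(emb.shape)                      # torch.Size([4, 128, 200])
print(emb.transpose(1, 2).shape)      # torch.Size([4, 200, 128])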
# modified from https://github.com/mit-han-lab/bevfusion
from typing import Any, Dict
import numpy as np
import torch
from mmcv.transforms import BaseTransform
from PIL import Image
from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class ImageAug3D(BaseTransform):
def __init__(self, final_dim, resize_lim, bot_pct_lim, rot_lim, rand_flip,
is_train):
self.final_dim = final_dim
self.resize_lim = resize_lim
self.bot_pct_lim = bot_pct_lim
self.rand_flip = rand_flip
self.rot_lim = rot_lim
self.is_train = is_train
def sample_augmentation(self, results):
H, W = results['ori_shape']
fH, fW = self.final_dim
if self.is_train:
resize = np.random.uniform(*self.resize_lim)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int(
(1 - np.random.uniform(*self.bot_pct_lim)) * newH) - fH
crop_w = int(np.random.uniform(0, max(0, newW - fW)))
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
if self.rand_flip and np.random.choice([0, 1]):
flip = True
rotate = np.random.uniform(*self.rot_lim)
else:
resize = np.mean(self.resize_lim)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.mean(self.bot_pct_lim)) * newH) - fH
crop_w = int(max(0, newW - fW) / 2)
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
rotate = 0
return resize, resize_dims, crop, flip, rotate
def img_transform(self, img, rotation, translation, resize, resize_dims,
crop, flip, rotate):
# adjust image
img = Image.fromarray(img.astype('uint8'), mode='RGB')
img = img.resize(resize_dims)
img = img.crop(crop)
if flip:
img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
img = img.rotate(rotate)
# post-homography transformation
rotation *= resize
translation -= torch.Tensor(crop[:2])
if flip:
A = torch.Tensor([[-1, 0], [0, 1]])
b = torch.Tensor([crop[2] - crop[0], 0])
rotation = A.matmul(rotation)
translation = A.matmul(translation) + b
theta = rotate / 180 * np.pi
A = torch.Tensor([
[np.cos(theta), np.sin(theta)],
[-np.sin(theta), np.cos(theta)],
])
b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
b = A.matmul(-b) + b
rotation = A.matmul(rotation)
translation = A.matmul(translation) + b
return img, rotation, translation
def transform(self, data: Dict[str, Any]) -> Dict[str, Any]:
imgs = data['img']
new_imgs = []
transforms = []
for img in imgs:
resize, resize_dims, crop, flip, rotate = self.sample_augmentation(
data)
post_rot = torch.eye(2)
post_tran = torch.zeros(2)
new_img, rotation, translation = self.img_transform(
img,
post_rot,
post_tran,
resize=resize,
resize_dims=resize_dims,
crop=crop,
flip=flip,
rotate=rotate,
)
transform = torch.eye(4)
transform[:2, :2] = rotation
transform[:2, 3] = translation
new_imgs.append(np.array(new_img).astype(np.float32))
transforms.append(transform.numpy())
data['img'] = new_imgs
# update the calibration matrices
data['img_aug_matrix'] = transforms
return data
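Each entry of img_aug_matrix is a 4x4 matrix: the top-left 2x2 block carries the accumulated scale/rotation/flip and column 3 carries the translation, so later view transforms can map between original and augmented pixel coordinates. A small check with made-up values (resize 0.5, crop offset (10, 20), no flip or rotation):

import torch

rotation = torch.eye(2) * 0.5               # post_rot after 'rotation *= resize'
translation = torch.tensor([-10.0, -20.0])  # post_tran after 'translation -= crop[:2]'
transform = torch.eye(4)
transform[:2, :2] = rotation
transform[:2, 3] = translation
src = torch.tensor([100.0, 200.0, 0.0, 1.0])  # homogeneous image point
print(transform @ src)  # tensor([40., 80., 0., 1.]) = (100*0.5-10, 200*0.5-20)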
@TRANSFORMS.register_module()
class GridMask(BaseTransform):
def __init__(
self,
use_h,
use_w,
max_epoch,
rotate=1,
offset=False,
ratio=0.5,
mode=0,
prob=1.0,
fixed_prob=False,
):
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.st_prob = prob
self.prob = prob
self.epoch = None
self.max_epoch = max_epoch
self.fixed_prob = fixed_prob
def set_epoch(self, epoch):
self.epoch = epoch
if not self.fixed_prob:
self.set_prob(self.epoch, self.max_epoch)
def set_prob(self, epoch, max_epoch):
self.prob = self.st_prob * epoch / max_epoch
def transform(self, results):
if np.random.rand() > self.prob:
return results
imgs = results['img']
h = imgs[0].shape[0]
w = imgs[0].shape[1]
self.d1 = 2
self.d2 = min(h, w)
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(self.d1, self.d2)
if self.ratio == 1:
self.length = np.random.randint(1, d)
else:
self.length = min(max(int(d * self.ratio + 0.5), 1), d - 1)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
if self.use_h:
for i in range(hh // d):
s = d * i + st_h
t = min(s + self.length, hh)
mask[s:t, :] *= 0
if self.use_w:
for i in range(ww // d):
s = d * i + st_w
t = min(s + self.length, ww)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
(ww - w) // 2:(ww - w) // 2 + w]
mask = mask.astype(np.float32)
mask = mask[:, :, None]
if self.mode == 1:
mask = 1 - mask
# mask = mask.expand_as(imgs[0])
if self.offset:
offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
offset = (1 - mask) * offset
imgs = [x * mask + offset for x in imgs]
else:
imgs = [x * mask for x in imgs]
results.update(img=imgs)
return results
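For reference, the stripe geometry GridMask produces: every d pixels a band of `length` pixels is zeroed along each enabled axis, so with ratio=0.5 roughly (1 - 0.5)^2 of the pixels survive before the mode-1 inversion. A tiny NumPy sketch of that arithmetic (ignoring the random shifts and rotation):

import numpy as np

h = w = 8
d, length = 4, 2          # stripe period and width, as sampled above
mask = np.ones((h, w), np.float32)
for s in range(0, h, d):
    mask[s:s + length, :] = 0   # horizontal bands (use_h)
for s in range(0, w, d):
    mask[:, s:s + length] = 0   # vertical bands (use_w)
print(mask.mean())  # 0.25 == (1 - length / d) ** 2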
# modified from https://github.com/mit-han-lab/bevfusion
import torch
from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder
try:
from scipy.optimize import linear_sum_assignment
except ImportError:
linear_sum_assignment = None
from mmdet3d.registry import TASK_UTILS
@TASK_UTILS.register_module()
class TransFusionBBoxCoder(BaseBBoxCoder):
def __init__(
self,
pc_range,
out_size_factor,
voxel_size,
post_center_range=None,
score_threshold=None,
code_size=8,
):
self.pc_range = pc_range
self.out_size_factor = out_size_factor
self.voxel_size = voxel_size
self.post_center_range = post_center_range
self.score_threshold = score_threshold
self.code_size = code_size
def encode(self, dst_boxes):
targets = torch.zeros([dst_boxes.shape[0],
self.code_size]).to(dst_boxes.device)
targets[:, 0] = (dst_boxes[:, 0] - self.pc_range[0]) / (
self.out_size_factor * self.voxel_size[0])
targets[:, 1] = (dst_boxes[:, 1] - self.pc_range[1]) / (
self.out_size_factor * self.voxel_size[1])
targets[:, 3] = dst_boxes[:, 3].log()
targets[:, 4] = dst_boxes[:, 4].log()
targets[:, 5] = dst_boxes[:, 5].log()
# bottom center to gravity center
targets[:, 2] = dst_boxes[:, 2] + dst_boxes[:, 5] * 0.5
targets[:, 6] = torch.sin(dst_boxes[:, 6])
targets[:, 7] = torch.cos(dst_boxes[:, 6])
if self.code_size == 10:
targets[:, 8:10] = dst_boxes[:, 7:]
return targets
def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
"""Decode bboxes.
Args:
heatmap (torch.Tensor): Heatmap with the shape of
[B, num_cls, num_proposals].
rot (torch.Tensor): Rotation with the shape of
[B, 1, num_proposals].
dim (torch.Tensor): Dim of the boxes with the shape of
[B, 3, num_proposals].
center (torch.Tensor): bev center of the boxes with the shape of
[B, 2, num_proposals]. (in feature map metric)
height (torch.Tensor): Height of the boxes with the shape of
[B, 1, num_proposals]. (in real world metric)
vel (torch.Tensor): Velocity with the shape of
[B, 2, num_proposals].
filter (bool): If False, return all boxes without checking
score and center_range.
Returns:
list[dict]: Decoded boxes.
"""
# class label
final_preds = heatmap.max(1, keepdims=False).indices
final_scores = heatmap.max(1, keepdims=False).values
# change size to real world metric
center[:, 0, :] = (center[:, 0, :] * self.out_size_factor *
self.voxel_size[0] + self.pc_range[0])
center[:, 1, :] = (center[:, 1, :] * self.out_size_factor *
self.voxel_size[1] + self.pc_range[1])
dim[:, 0, :] = dim[:, 0, :].exp()
dim[:, 1, :] = dim[:, 1, :].exp()
dim[:, 2, :] = dim[:, 2, :].exp()
# gravity center to bottom center
height = height - dim[:, 2:3, :] * 0.5
rots, rotc = rot[:, 0:1, :], rot[:, 1:2, :]
rot = torch.atan2(rots, rotc)
if vel is None:
final_box_preds = torch.cat([center, height, dim, rot],
dim=1).permute(0, 2, 1)
else:
final_box_preds = torch.cat([center, height, dim, rot, vel],
dim=1).permute(0, 2, 1)
predictions_dicts = []
for i in range(heatmap.shape[0]):
boxes3d = final_box_preds[i]
scores = final_scores[i]
labels = final_preds[i]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
predictions_dicts.append(predictions_dict)
if filter is False:
return predictions_dicts
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=heatmap.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2)
mask &= (final_box_preds[..., :3] <=
self.post_center_range[3:]).all(2)
predictions_dicts = []
for i in range(heatmap.shape[0]):
cmask = mask[i, :]
if self.score_threshold:
cmask &= thresh_mask[i]
boxes3d = final_box_preds[i, cmask]
scores = final_scores[i, cmask]
labels = final_preds[i, cmask]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
predictions_dicts.append(predictions_dict)
else:
raise NotImplementedError(
'Need to reorganize output as a batch, only '
'support post_center_range is not None for now!')
return predictions_dicts
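encode() converts a LiDAR box into the head's regression space: BEV center in feature-map cells, log dims, gravity-center height, and (sin, cos) yaw; decode() inverts every step. A spot check of the center math using the values from the config later in this diff (the box itself is made up):

import torch

coder = TransFusionBBoxCoder(
    pc_range=[-54.0, -54.0],
    out_size_factor=8,
    voxel_size=[0.075, 0.075],
    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    code_size=8)
# (x, y, z_bottom, dx, dy, dz, yaw)
box = torch.tensor([[10.0, -4.0, -1.0, 4.2, 1.9, 1.6, 0.3]])
t = coder.encode(box)
print(t[0, 0])  # (10 - (-54)) / (8 * 0.075) = tensor(106.6667)
print(t[0, 2])  # gravity center: -1.0 + 1.6 * 0.5 = tensor(-0.2000)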
@TASK_UTILS.register_module()
class BBoxBEVL1Cost(object):
def __init__(self, weight):
self.weight = weight
def __call__(self, bboxes, gt_bboxes, train_cfg):
pc_start = bboxes.new(train_cfg['point_cloud_range'][0:2])
pc_range = bboxes.new(
train_cfg['point_cloud_range'][3:5]) - bboxes.new(
train_cfg['point_cloud_range'][0:2])
# normalize the box center to [0, 1]
normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range
normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range
reg_cost = torch.cdist(
normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1)
return reg_cost * self.weight
@TASK_UTILS.register_module()
class IoU3DCost(object):
def __init__(self, weight):
self.weight = weight
def __call__(self, iou):
iou_cost = -iou
return iou_cost * self.weight
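Both costs above are deliberately simple: BBoxBEVL1Cost is an L1 distance between BEV centers after normalizing them into [0, 1] by the point-cloud range, and IoU3DCost negates a precomputed IoU matrix so that better overlap means lower cost. A toy evaluation (boxes and weights are made up; only the first two columns of each box are read):

import torch

train_cfg = {'point_cloud_range': [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]}
pred = torch.tensor([[0.0, 0.0], [54.0, 54.0]])  # BEV centers
gt = torch.tensor([[0.0, 0.0]])
print(BBoxBEVL1Cost(weight=0.25)(pred, gt, train_cfg))
# tensor([[0.0000], [0.2500]]): the far corner pays (0.5 + 0.5) * 0.25
iou = torch.tensor([[0.8], [0.1]])
print(IoU3DCost(weight=0.25)(iou))
# tensor([[-0.2000], [-0.0250]])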
@TASK_UTILS.register_module()
class HeuristicAssigner3D(BaseAssigner):
def __init__(self,
dist_thre=100,
iou_calculator=dict(type='BboxOverlaps3D')):
self.dist_thre = dist_thre # distance in meter
self.iou_calculator = TASK_UTILS.build(iou_calculator)
def assign(self,
bboxes,
gt_bboxes,
gt_bboxes_ignore=None,
gt_labels=None,
query_labels=None):
dist_thre = self.dist_thre
num_gts, num_bboxes = len(gt_bboxes), len(bboxes)
bev_dist = torch.norm(
bboxes[:, 0:2][None, :, :] - gt_bboxes[:, 0:2][:, None, :],
dim=-1) # [num_gts, num_bboxes]
if query_labels is not None:
# only match the gt box and query with same category
not_same_class = (query_labels[None] != gt_labels[:, None])
bev_dist += not_same_class * dist_thre
# for each gt box, assign it to the nearest pred box
nearest_values, nearest_indices = bev_dist.min(1) # [num_gts]
assigned_gt_inds = bboxes.new_zeros(num_bboxes)
assigned_gt_vals = bboxes.new_full((num_bboxes, ), 10000.)
assigned_gt_labels = bboxes.new_full((num_bboxes, ), -1.)
for idx_gts in range(num_gts):
# for idx_pred in torch.where(bev_dist[idx_gts] < dist_thre)[0]:
# # each gt would match every pred box within some radius
# each gt only matches the nearest pred box
idx_pred = nearest_indices[idx_gts]
if bev_dist[idx_gts, idx_pred] <= dist_thre:
# if this pred box is assigned, then compare
if bev_dist[idx_gts, idx_pred] < assigned_gt_vals[idx_pred]:
assigned_gt_vals[idx_pred] = bev_dist[idx_gts, idx_pred]
# for AssignResult, 0 is negative, -1 is ignore, 1-based
# indices are positive
assigned_gt_inds[idx_pred] = idx_gts + 1
assigned_gt_labels[idx_pred] = gt_labels[idx_gts]
max_overlaps = bboxes.new_zeros(num_bboxes)
matched_indices = torch.where(assigned_gt_inds > 0)
matched_iou = self.iou_calculator(
gt_bboxes[assigned_gt_inds[matched_indices].long() - 1],
bboxes[matched_indices]).diag()
max_overlaps[matched_indices] = matched_iou
return AssignResult(
num_gts,
assigned_gt_inds.long(),
max_overlaps,
labels=assigned_gt_labels)
@TASK_UTILS.register_module()
class HungarianAssigner3D(BaseAssigner):
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxBEVL1Cost', weight=1.0),
iou_cost=dict(type='IoU3DCost', weight=1.0),
iou_calculator=dict(type='BboxOverlaps3D')):
self.cls_cost = TASK_UTILS.build(cls_cost)
self.reg_cost = TASK_UTILS.build(reg_cost)
self.iou_cost = TASK_UTILS.build(iou_cost)
self.iou_calculator = TASK_UTILS.build(iou_calculator)
def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
# 1. assign -1 by default
assigned_gt_inds = bboxes.new_full((num_bboxes, ),
-1,
dtype=torch.long)
assigned_labels = bboxes.new_full((num_bboxes, ), -1, dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
# 2. compute the weighted costs
# see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
iou = self.iou_calculator(bboxes, gt_bboxes)
iou_cost = self.iou_cost(iou)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(bboxes.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(bboxes.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
max_overlaps = torch.zeros_like(iou.max(1).values)
max_overlaps[matched_row_inds] = iou[matched_row_inds,
matched_col_inds]
# max_overlaps = iou.max(1).values
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
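Step 3 above is a plain rectangular Hungarian solve: linear_sum_assignment returns row/column index arrays minimizing the summed cost, assigning each prediction to at most one gt. A toy run (scipy assumed available, per the import guard at the top of this file):

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.1, 0.9],
                 [0.8, 0.2],
                 [0.5, 0.6]])  # 3 predictions x 2 gts
rows, cols = linear_sum_assignment(cost)
print(rows, cols)  # [0 1] [0 1]; prediction 2 is left unmatched and
                   # later falls into the background assignment (index 0)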
_base_ = ['mmdet3d::_base_/default_runtime.py']
custom_imports = dict(
imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
# model settings
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
voxel_size = [0.075, 0.075, 0.2]
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
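# Sanity check on the two settings above: (54 - (-54)) / 0.075 = 1440 voxels
# along each BEV axis and (3 - (-5)) / 0.2 = 40 along z, which is why
# sparse_shape / grid_size below read [1440, 1440, 41] (the extra z slot is
# the usual sparse-conv convention).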
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
metainfo = dict(classes=class_names)
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
data_prefix = dict(
pts='samples/LIDAR_TOP',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
sweeps='sweeps/LIDAR_TOP')
input_modality = dict(use_lidar=True, use_camera=True)
file_client_args = dict(backend='disk')
model = dict(
type='BEVFusion',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=False,
pad_size_divisor=32,
voxelize_cfg=dict(
max_num_points=10,
point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
voxel_size=[0.075, 0.075, 0.2],
max_voxels=[120000, 160000],
voxelize_reduce=True)),
img_backbone=dict(
type='mmdet.SwinTransformer',
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.2,
patch_norm=True,
out_indices=[1, 2, 3],
with_cp=False,
convert_weights=True,
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501
)),
img_neck=dict(
type='GeneralizedLSSFPN',
in_channels=[192, 384, 768],
out_channels=256,
start_level=0,
num_outs=3,
norm_cfg=dict(type='BN2d', requires_grad=True),
act_cfg=dict(type='ReLU', inplace=True),
upsample_cfg=dict(mode='bilinear', align_corners=False)),
vtransform=dict(
type='DepthLSSTransform',
in_channels=256,
out_channels=80,
image_size=[256, 704],
feature_size=[32, 88],
xbound=[-54.0, 54.0, 0.3],
ybound=[-54.0, 54.0, 0.3],
zbound=[-10.0, 10.0, 20.0],
dbound=[1.0, 60.0, 0.5],
downsample=2),
pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
pts_middle_encoder=dict(
type='BEVFusionSparseEncoder',
in_channels=5,
sparse_shape=[1440, 1440, 41],
order=('conv', 'norm', 'act'),
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)),
block_type='basicblock'),
fusion_layer=dict(
type='ConvFuser', in_channels=[80, 256], out_channels=256),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
bbox_head=dict(
type='TransFusionHead',
num_proposals=200,
auxiliary=True,
in_channels=512,
hidden_channel=128,
num_classes=10,
nms_kernel_size=3,
bn_momentum=0.1,
num_decoder_layers=1,
decoder_layer=dict(
type='TransformerDecoderLayer',
self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1),
cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1),
ffn_cfg=dict(
embed_dims=128,
feedforward_channels=256,
num_fcs=2,
ffn_drop=0.1,
act_cfg=dict(type='ReLU', inplace=True),
),
norm_cfg=dict(type='LN'),
pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)),
train_cfg=dict(
dataset='nuScenes',
point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
grid_size=[1440, 1440, 41],
voxel_size=[0.075, 0.075, 0.2],
out_size_factor=8,
gaussian_overlap=0.1,
min_radius=2,
pos_weight=-1,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(
type='mmdet.FocalLossCost',
gamma=2.0,
alpha=0.25,
weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25))),
test_cfg=dict(
dataset='nuScenes',
grid_size=[1440, 1440, 41],
out_size_factor=8,
voxel_size=[0.075, 0.075],
pc_range=[-54.0, -54.0],
nms_type=None),
common_heads=dict(
center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=[-54.0, -54.0],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
out_size_factor=8,
voxel_size=[0.075, 0.075],
code_size=10),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
reduction='mean',
loss_weight=1.0),
loss_heatmap=dict(
type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)))
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
reduce_beams=32))
train_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
reduce_beams=32,
load_augmented=None),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
load_dim=5,
use_dim=5,
reduce_beams=32,
pad_empty_sweeps=True,
remove_close=True,
load_augmented=None),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False),
# dict(type='ObjectSampling', db_sampler=db_sampler),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.38, 0.55],
bot_pct_lim=[0.0, 0.0],
rot_lim=[-5.4, 5.4],
rand_flip=True,
is_train=True),
dict(
type='GlobalRotScaleTrans',
resize_lim=[0.9, 1.1],
rot_lim=[-0.78539816, 0.78539816],
trans_lim=0.5,
is_train=True),
dict(type='RandomFlip3D'),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='ObjectNameFilter',
classes=[
'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]),
dict(
type='GridMask',
use_h=True,
use_w=True,
max_epoch=6,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.0,
fixed_prob=True),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
'gt_labels'
])
]
test_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color'),
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
load_dim=5,
use_dim=5,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.48, 0.48],
bot_pct_lim=[0.0, 0.0],
rot_lim=[0.0, 0.0],
rand_flip=False,
is_train=False),
dict(
type='PointsRangeFilter',
point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
dict(
type='Pack3DDetInputs',
keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path'
])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='nuscenes_infos_train.pkl',
pipeline=train_pipeline,
metainfo=metainfo,
modality=input_modality,
test_mode=False,
data_prefix=data_prefix,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'))
val_dataloader = dict(
batch_size=1,
num_workers=0,
# persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='nuscenes_infos_val.pkl',
pipeline=test_pipeline,
metainfo=metainfo,
modality=input_modality,
data_prefix=data_prefix,
test_mode=True,
box_type_3d='LiDAR'))
test_dataloader = val_dataloader
val_evaluator = dict(
type='NuScenesMetric',
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.33333333,
by_epoch=False,
begin=0,
end=500),
dict(
type='CosineAnnealingLR',
begin=0,
T_max=6,
end=6,
by_epoch=True,
eta_min_ratio=1e-3),
# momentum scheduler
# During the first 2.4 epochs, momentum decreases from 1 to 0.85 / 0.95;
# during the remaining 3.6 epochs, momentum increases back from
# 0.85 / 0.95 to 1
dict(
type='CosineAnnealingMomentum',
eta_min=0.85 / 0.95,
begin=0,
end=2.4,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
eta_min=1,
begin=2.4,
end=6,
by_epoch=True,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=6)
val_cfg = dict()
test_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01),
clip_grad=dict(max_norm=35, norm_type=2))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (4 GPUs) x (4 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
default_hooks = dict(
logger=dict(type='LoggerHook', interval=50),
checkpoint=dict(type='CheckpointHook', interval=5))