Unverified Commit 4c5c4299 authored by Tai-Wang, committed by GitHub

Merge branch 'master' into v1.0.0.dev0

parents b420cd1d 86cc487c
@@ -3,74 +3,74 @@
 {
 "cell_type": "code",
 "execution_count": 7,
+"source": [
+"from mmdet3d.apis import init_model, inference_detector, show_result_meshlab"
+],
+"outputs": [],
 "metadata": {
 "pycharm": {
 "is_executing": false
 }
-},
-"outputs": [],
-"source": [
-"from mmdet3d.apis import init_detector, inference_detector, show_result_meshlab"
-]
+}
 },
 {
 "cell_type": "code",
 "execution_count": 8,
 "metadata": {
 "pycharm": {
 "is_executing": false
 }
 },
 "outputs": [],
 "source": [
 "config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'\n",
 "# download the checkpoint from model zoo and put it in `checkpoints/`\n",
 "checkpoint_file = '../work_dirs/second/epoch_40.pth'"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 9,
+"source": [
+"# build the model from a config file and a checkpoint file\n",
+"model = init_model(config_file, checkpoint_file, device='cuda:0')"
+],
+"outputs": [],
 "metadata": {
 "pycharm": {
 "is_executing": false
 }
-},
-"outputs": [],
-"source": [
-"# build the model from a config file and a checkpoint file\n",
-"model = init_detector(config_file, checkpoint_file, device='cuda:0')"
-]
+}
 },
 {
 "cell_type": "code",
 "execution_count": 10,
+"source": [
+"# test a single sample\n",
+"pcd = 'kitti_000008.bin'\n",
+"result, data = inference_detector(model, pcd)"
+],
+"outputs": [],
 "metadata": {
 "pycharm": {
 "is_executing": false
 }
-},
-"outputs": [],
-"source": [
-"# test a single sample\n",
-"pcd = 'kitti_000008.bin'\n",
-"result, data = inference_detector(model, pcd)"
-]
+}
 },
 {
 "cell_type": "code",
 "execution_count": 11,
+"source": [
+"# show the results\n",
+"out_dir = './'\n",
+"show_result_meshlab(data, result, out_dir)"
+],
+"outputs": [],
 "metadata": {
 "pycharm": {
 "is_executing": false
 }
-},
-"outputs": [],
-"source": [
-"# show the results\n",
-"out_dir = './'\n",
-"show_result_meshlab(data, result, out_dir)"
-]
+}
 }
 ],
 "metadata": {
......
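The notebook hunk above tracks the rename of `init_detector` to `init_model` in `mmdet3d.apis`; the cells were also re-serialized, which is why the JSON key order inside each changed cell moved around. A minimal sketch of the updated inference flow as a plain script, reusing the notebook's illustrative paths:

```python
from mmdet3d.apis import inference_detector, init_model, show_result_meshlab

config_file = '../configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'
# download the checkpoint from model zoo and put it in `checkpoints/`
checkpoint_file = '../work_dirs/second/epoch_40.pth'

# init_model replaces the old init_detector; the call signature is unchanged
model = init_model(config_file, checkpoint_file, device='cuda:0')

# test a single sample stored as a KITTI-style .bin point cloud
result, data = inference_detector(model, 'kitti_000008.bin')

# write result files that can be opened in MeshLab
show_result_meshlab(data, result, './')
```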
@@ -28,7 +28,7 @@ class GatherPoints(Function):
         B, npoint = indices.size()
         _, C, N = features.size()
-        output = torch.cuda.FloatTensor(B, C, npoint)
+        output = features.new_zeros((B, C, npoint))

         gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                 indices, output)
@@ -42,7 +42,7 @@ class GatherPoints(Function):
         idx, C, N = ctx.for_backwards
         B, npoint = idx.size()
-        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
+        grad_features = grad_out.new_zeros((B, C, N))

         grad_out_data = grad_out.data.contiguous()
         gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
                                                      grad_out_data, idx,
......
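This allocation change is what makes the fp16 support in the CUDA files below usable end to end: `Tensor.new_zeros` inherits dtype and device from the source tensor and is zero-initialized, while the legacy `torch.cuda.FloatTensor` constructor always produces fp32 storage and leaves it uninitialized unless `.zero_()` is called. A small standalone sketch of the difference (not mmdet3d code):

```python
import torch

features = torch.rand(2, 3, 8, device='cuda:0', dtype=torch.half)

# Legacy pattern: fixed fp32 dtype, uninitialized memory unless .zero_() runs.
out_legacy = torch.cuda.FloatTensor(2, 3, 4).zero_()
print(out_legacy.dtype)               # torch.float32, regardless of input

# New pattern: dtype and device follow `features`, memory already zeroed.
out_new = features.new_zeros((2, 3, 4))
print(out_new.dtype, out_new.device)  # torch.float16 cuda:0
```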
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/TensorUtils.h>
-#include <THC/THC.h>
 #include <torch/extension.h>
 #include <torch/serialize/tensor.h>
 #include <vector>

-extern THCState *state;

 int gather_points_wrapper(int b, int c, int n, int npoints,
-                          at::Tensor points_tensor, at::Tensor idx_tensor,
-                          at::Tensor out_tensor);
+                          at::Tensor& points_tensor, at::Tensor& idx_tensor,
+                          at::Tensor& out_tensor);

 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-                                   const float *points, const int *idx,
-                                   float *out, cudaStream_t stream);
+                                   const at::Tensor& points_tensor,
+                                   const at::Tensor& idx_tensor,
+                                   at::Tensor& out_tensor);

 int gather_points_grad_wrapper(int b, int c, int n, int npoints,
-                               at::Tensor grad_out_tensor,
-                               at::Tensor idx_tensor,
-                               at::Tensor grad_points_tensor);
+                               at::Tensor& grad_out_tensor,
+                               at::Tensor& idx_tensor,
+                               at::Tensor& grad_points_tensor);

 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-                                        const float *grad_out, const int *idx,
-                                        float *grad_points,
-                                        cudaStream_t stream);
+                                        const at::Tensor& grad_out_tensor,
+                                        const at::Tensor& idx_tensor,
+                                        at::Tensor& grad_points_tensor);

 int gather_points_wrapper(int b, int c, int n, int npoints,
-                          at::Tensor points_tensor, at::Tensor idx_tensor,
-                          at::Tensor out_tensor) {
-  const float *points = points_tensor.data_ptr<float>();
-  const int *idx = idx_tensor.data_ptr<int>();
-  float *out = out_tensor.data_ptr<float>();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
+                          at::Tensor& points_tensor, at::Tensor& idx_tensor,
+                          at::Tensor& out_tensor)
+{
+  gather_points_kernel_launcher(b, c, n, npoints, points_tensor, idx_tensor,
+                                out_tensor);
   return 1;
 }

 int gather_points_grad_wrapper(int b, int c, int n, int npoints,
-                               at::Tensor grad_out_tensor,
-                               at::Tensor idx_tensor,
-                               at::Tensor grad_points_tensor) {
-  const float *grad_out = grad_out_tensor.data_ptr<float>();
-  const int *idx = idx_tensor.data_ptr<int>();
-  float *grad_points = grad_points_tensor.data_ptr<float>();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
-                                     grad_points, stream);
+                               at::Tensor& grad_out_tensor,
+                               at::Tensor& idx_tensor,
+                               at::Tensor& grad_points_tensor)
+{
+  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out_tensor,
+                                     idx_tensor, grad_points_tensor);
   return 1;
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
   m.def("gather_points_wrapper", &gather_points_wrapper,
         "gather_points_wrapper");
   m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
......
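With the THC dependency gone, the wrappers no longer unpack raw pointers or look up the stream themselves; they forward `at::Tensor` references and let the launcher pick the dtype and the current CUDA stream. One observable consequence from Python is that the op keeps following the caller's stream context; a sketch, assuming only that `gather_points` is built on these kernels:

```python
import torch
from mmdet3d.ops import gather_points

features = torch.rand(2, 16, 128, device='cuda:0')           # (B, C, N)
idx = torch.randint(0, 128, (2, 32), device='cuda:0').int()  # (B, npoint)

# The launcher queries at::cuda::getCurrentCUDAStream(), so a kernel issued
# inside a stream context runs on that stream instead of the default one.
side_stream = torch.cuda.Stream()
with torch.cuda.stream(side_stream):
    out = gather_points(features, idx)                       # (B, C, npoint)
side_stream.synchronize()
```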
 #include <stdio.h>
 #include <stdlib.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/types.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>

 #define TOTAL_THREADS 1024
 #define THREADS_PER_BLOCK 256
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

+template <typename scalar_t>
 __global__ void gather_points_kernel(int b, int c, int n, int m,
-                                     const float *__restrict__ points,
+                                     const scalar_t *__restrict__ points,
                                      const int *__restrict__ idx,
-                                     float *__restrict__ out) {
+                                     scalar_t *__restrict__ out) {
   // points: (B, C, N)
   // idx: (B, M)
   // output:
@@ -26,8 +33,10 @@ __global__ void gather_points_kernel(int b, int c, int n, int m,
 }

 void gather_points_kernel_launcher(int b, int c, int n, int npoints,
-                                   const float *points, const int *idx,
-                                   float *out, cudaStream_t stream) {
+                                   const at::Tensor& points_tensor,
+                                   const at::Tensor& idx_tensor,
+                                   at::Tensor& out_tensor)
+{
   // points: (B, C, N)
   // idx: (B, npoints)
   // output:
@@ -37,21 +46,31 @@ void gather_points_kernel_launcher(int b, int c, int n, int npoints,
   dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
               b);  // blockIdx.x(col), blockIdx.y(row)
   dim3 threads(THREADS_PER_BLOCK);

-  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints,
-                                                       points, idx, out);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      out_tensor.scalar_type(), "gather_points_kernel",
+      [&]
+      {
+        const scalar_t *points = points_tensor.data_ptr<scalar_t>();
+        const int *idx = idx_tensor.data_ptr<int>();
+        scalar_t *out = out_tensor.data_ptr<scalar_t>();
+        gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints,
+                                                             points, idx, out);
+      });

   err = cudaGetLastError();
-  if (cudaSuccess != err) {
+  if (cudaSuccess != err)
+  {
     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
     exit(-1);
   }
 }

+template <typename scalar_t>
 __global__ void gather_points_grad_kernel(int b, int c, int n, int m,
-                                          const float *__restrict__ grad_out,
+                                          const scalar_t *__restrict__ grad_out,
                                           const int *__restrict__ idx,
-                                          float *__restrict__ grad_points) {
+                                          scalar_t *__restrict__ grad_points) {
   // grad_out: (B, C, M)
   // idx: (B, M)
   // output:
@@ -70,9 +89,10 @@ __global__ void gather_points_grad_kernel(int b, int c, int n, int m,
 }

 void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
-                                        const float *grad_out, const int *idx,
-                                        float *grad_points,
-                                        cudaStream_t stream) {
+                                        const at::Tensor& grad_out_tensor,
+                                        const at::Tensor& idx_tensor,
+                                        at::Tensor& grad_points_tensor)
+{
   // grad_out: (B, C, npoints)
   // idx: (B, npoints)
   // output:
@@ -83,11 +103,21 @@ void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
               b);  // blockIdx.x(col), blockIdx.y(row)
   dim3 threads(THREADS_PER_BLOCK);

-  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
-      b, c, n, npoints, grad_out, idx, grad_points);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad_points_tensor.scalar_type(), "gather_points_grad_kernel",
+      [&]
+      {
+        const scalar_t *grad_out = grad_out_tensor.data_ptr<scalar_t>();
+        const int *idx = idx_tensor.data_ptr<int>();
+        scalar_t *grad_points = grad_points_tensor.data_ptr<scalar_t>();
+        gather_points_grad_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            b, c, n, npoints, grad_out, idx, grad_points);
+      });

   err = cudaGetLastError();
-  if (cudaSuccess != err) {
+  if (cudaSuccess != err)
+  {
     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
     exit(-1);
   }
......
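`AT_DISPATCH_FLOATING_TYPES_AND_HALF` expands the lambda once per supported dtype (float, double, half) and selects the instantiation at runtime from the tensor's `scalar_type()`, which is what turns the previously float-only kernels into dtype-generic ones. Seen from Python, the same call now accepts fp32 and fp16 inputs; a sketch mirroring the unit test added further down:

```python
import torch
from mmdet3d.ops import gather_points

features = torch.rand(2, 4, 16, device='cuda:0')           # (B, C, N)
idx = torch.randint(0, 16, (2, 6), device='cuda:0').int()  # (B, npoint)

out_fp32 = gather_points(features, idx)         # dispatches the float kernel
out_fp16 = gather_points(features.half(), idx)  # dispatches the half kernel

# Gathering only copies values, so the results agree up to fp16 rounding.
assert torch.allclose(out_fp16.float(), out_fp32, atol=1e-3)
```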
@@ -2,6 +2,7 @@
 from typing import Tuple

 import torch
+from mmcv.runner import force_fp32
 from torch import nn as nn
 from torch.autograd import Function

@@ -62,7 +63,9 @@ class QueryAndGroup(nn.Module):
         if self.max_radius is None:
             assert not self.normalize_xyz, \
                 'can not normalize grouped xyz when max_radius is None'
+        self.fp16_enabled = False

+    @force_fp32()
     def forward(self, points_xyz, center_xyz, features=None):
         """forward.
@@ -143,7 +146,9 @@ class GroupAll(nn.Module):
     def __init__(self, use_xyz: bool = True):
         super().__init__()
         self.use_xyz = use_xyz
+        self.fp16_enabled = False

+    @force_fp32()
     def forward(self,
                 xyz: torch.Tensor,
                 new_xyz: torch.Tensor,
......
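`force_fp32` is mmcv's decorator for mixed-precision training: once `fp16_enabled` is set on the module (mmcv flips it on when fp16 training is enabled), tensor arguments of the decorated `forward` are cast to fp32 before the call. Grouping involves coordinate subtraction and radius comparisons, which is presumably why it is pinned to fp32 here. A toy sketch of the pattern; the `Grouper` class is invented for illustration:

```python
import torch
from mmcv.runner import force_fp32
from torch import nn


class Grouper(nn.Module):
    """Stand-in for QueryAndGroup, only to show the decorator pattern."""

    def __init__(self):
        super().__init__()
        self.fp16_enabled = True  # normally set by mmcv's fp16 wrappers

    @force_fp32()
    def forward(self, xyz, new_xyz):
        # fp16 inputs arrive here already cast to fp32
        assert xyz.dtype == torch.float32
        return new_xyz.unsqueeze(2) - xyz.unsqueeze(1)  # (B, M, N, 3)


grouper = Grouper().cuda()
diff = grouper(torch.rand(2, 8, 3).cuda().half(),
               torch.rand(2, 4, 3).cuda().half())
print(diff.dtype)  # torch.float32
```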
@@ -2,9 +2,16 @@
 import pytest
 import torch

-from mmdet3d.ops import (ball_query, furthest_point_sample,
-                         furthest_point_sample_with_dist, gather_points,
-                         grouping_operation, knn, three_interpolate, three_nn)
+from mmdet3d.ops import (
+    ball_query,
+    furthest_point_sample,
+    furthest_point_sample_with_dist,
+    gather_points,
+    grouping_operation,
+    knn,
+    three_interpolate,
+    three_nn,
+)


 def test_fps():
@@ -236,6 +243,8 @@ def test_gather_points():
                      [-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
     assert torch.allclose(output, expected_output)

+    output_half = gather_points(features.half(), idx)
+    assert torch.allclose(output_half, expected_output.half())


 def test_three_interpolate():
......