Commit 4db49780 authored by Shaoshuai Shi's avatar Shaoshuai Shi
Browse files

continune to update cuda codes to support PyTorch 1.1/1.5

parent 11a7e434
...@@ -85,7 +85,7 @@ All models are trained with 8 GTX 1080Ti GPUs and are available for download. ...@@ -85,7 +85,7 @@ All models are trained with 8 GTX 1080Ti GPUs and are available for download.
| | training time | Car | Pedestrian | Cyclist | download | | | training time | Car | Pedestrian | Cyclist | download |
|---------------------------------------------|----------:|:-------:|:-------:|:-------:|:---------:| |---------------------------------------------|----------:|:-------:|:-------:|:-------:|:---------:|
| [PointPillar](tools/cfgs/kitti_models/pointpillar.yaml) |~1.5 hours| 77.28 | 52.29 | 62.68 | [model-18M](https://drive.google.com/file/d/1wMxWTpU1qUoY3DsCH31WJmvJxcjFXKlm/view?usp=sharing) | | [PointPillar](tools/cfgs/kitti_models/pointpillar.yaml) |~1 hours| 77.28 | 52.29 | 62.68 | [model-18M](https://drive.google.com/file/d/1wMxWTpU1qUoY3DsCH31WJmvJxcjFXKlm/view?usp=sharing) |
| [SECOND](tools/cfgs/kitti_models/second.yaml) | ~2 hours | 78.62 | 52.98 | 67.15 | [model-20M](https://drive.google.com/file/d/1-01zsPOsqanZQqIIyy7FpNXStL3y4jdR/view?usp=sharing) | | [SECOND](tools/cfgs/kitti_models/second.yaml) | ~2 hours | 78.62 | 52.98 | 67.15 | [model-20M](https://drive.google.com/file/d/1-01zsPOsqanZQqIIyy7FpNXStL3y4jdR/view?usp=sharing) |
| [PointRCNN](tools/cfgs/kitti_models/pointrcnn.yaml) | ~3 hours | 78.70 | 54.41 | 72.11 | [model-16M](https://drive.google.com/file/d/1BCX9wMn-GYAfSOPpyxf6Iv6fc0qKLSiU/view?usp=sharing)| | [PointRCNN](tools/cfgs/kitti_models/pointrcnn.yaml) | ~3 hours | 78.70 | 54.41 | 72.11 | [model-16M](https://drive.google.com/file/d/1BCX9wMn-GYAfSOPpyxf6Iv6fc0qKLSiU/view?usp=sharing)|
| [PointRCNN-IoU](tools/cfgs/kitti_models/pointrcnn_iou.yaml) | ~3 hours | 78.75 | 58.32 | 71.34 | [model-16M](https://drive.google.com/file/d/1V0vNZ3lAHpEEt0MlT80eL2f41K2tHm_D/view?usp=sharing)| | [PointRCNN-IoU](tools/cfgs/kitti_models/pointrcnn_iou.yaml) | ~3 hours | 78.75 | 58.32 | 71.34 | [model-16M](https://drive.google.com/file/d/1V0vNZ3lAHpEEt0MlT80eL2f41K2tHm_D/view?usp=sharing)|
......
...@@ -154,7 +154,7 @@ class AxisAlignedTargetAssigner(object): ...@@ -154,7 +154,7 @@ class AxisAlignedTargetAssigner(object):
empty_gt_mask = gt_to_anchor_max == 0 empty_gt_mask = gt_to_anchor_max == 0
gt_to_anchor_max[empty_gt_mask] = -1 gt_to_anchor_max[empty_gt_mask] = -1
anchors_with_max_overlap = torch.nonzero(anchor_by_gt_overlap == gt_to_anchor_max)[:, 0] anchors_with_max_overlap = (anchor_by_gt_overlap == gt_to_anchor_max).nonzero()[:, 0]
gt_inds_force = anchor_to_gt_argmax[anchors_with_max_overlap] gt_inds_force = anchor_to_gt_argmax[anchors_with_max_overlap]
labels[anchors_with_max_overlap] = gt_classes[gt_inds_force] labels[anchors_with_max_overlap] = gt_classes[gt_inds_force]
gt_ids[anchors_with_max_overlap] = gt_inds_force.int() gt_ids[anchors_with_max_overlap] = gt_inds_force.int()
...@@ -163,11 +163,11 @@ class AxisAlignedTargetAssigner(object): ...@@ -163,11 +163,11 @@ class AxisAlignedTargetAssigner(object):
gt_inds_over_thresh = anchor_to_gt_argmax[pos_inds] gt_inds_over_thresh = anchor_to_gt_argmax[pos_inds]
labels[pos_inds] = gt_classes[gt_inds_over_thresh] labels[pos_inds] = gt_classes[gt_inds_over_thresh]
gt_ids[pos_inds] = gt_inds_over_thresh.int() gt_ids[pos_inds] = gt_inds_over_thresh.int()
bg_inds = torch.nonzero(anchor_to_gt_max < unmatched_threshold)[:, 0] bg_inds = (anchor_to_gt_max < unmatched_threshold).nonzero()[:, 0]
else: else:
bg_inds = torch.arange(num_anchors, device=anchors.device) bg_inds = torch.arange(num_anchors, device=anchors.device)
fg_inds = torch.nonzero(labels > 0)[:, 0] fg_inds = (labels > 0).nonzero()[:, 0]
if self.pos_fraction is not None: if self.pos_fraction is not None:
num_fg = int(self.pos_fraction * self.sample_size) num_fg = int(self.pos_fraction * self.sample_size)
...@@ -175,7 +175,7 @@ class AxisAlignedTargetAssigner(object): ...@@ -175,7 +175,7 @@ class AxisAlignedTargetAssigner(object):
num_disabled = len(fg_inds) - num_fg num_disabled = len(fg_inds) - num_fg
disable_inds = torch.randperm(len(fg_inds))[:num_disabled] disable_inds = torch.randperm(len(fg_inds))[:num_disabled]
labels[disable_inds] = -1 labels[disable_inds] = -1
fg_inds = torch.nonzero(labels > 0)[:, 0] fg_inds = (labels > 0).nonzero()[:, 0]
num_bg = self.sample_size - (labels > 0).sum() num_bg = self.sample_size - (labels > 0).sum()
if len(bg_inds) > num_bg: if len(bg_inds) > num_bg:
......
...@@ -118,10 +118,10 @@ class ProposalTargetLayer(nn.Module): ...@@ -118,10 +118,10 @@ class ProposalTargetLayer(nn.Module):
fg_rois_per_image = int(np.round(self.roi_sampler_cfg.FG_RATIO * self.roi_sampler_cfg.ROI_PER_IMAGE)) fg_rois_per_image = int(np.round(self.roi_sampler_cfg.FG_RATIO * self.roi_sampler_cfg.ROI_PER_IMAGE))
fg_thresh = min(self.roi_sampler_cfg.REG_FG_THRESH, self.roi_sampler_cfg.CLS_FG_THRESH) fg_thresh = min(self.roi_sampler_cfg.REG_FG_THRESH, self.roi_sampler_cfg.CLS_FG_THRESH)
fg_inds = torch.nonzero((max_overlaps >= fg_thresh)).view(-1) fg_inds = ((max_overlaps >= fg_thresh)).nonzero().view(-1)
easy_bg_inds = torch.nonzero((max_overlaps < self.roi_sampler_cfg.CLS_BG_THRESH_LO)).view(-1) easy_bg_inds = ((max_overlaps < self.roi_sampler_cfg.CLS_BG_THRESH_LO)).nonzero().view(-1)
hard_bg_inds = torch.nonzero((max_overlaps < self.roi_sampler_cfg.REG_FG_THRESH) & hard_bg_inds = ((max_overlaps < self.roi_sampler_cfg.REG_FG_THRESH) &
(max_overlaps >= self.roi_sampler_cfg.CLS_BG_THRESH_LO)).view(-1) (max_overlaps >= self.roi_sampler_cfg.CLS_BG_THRESH_LO)).nonzero().view(-1)
fg_num_rois = fg_inds.numel() fg_num_rois = fg_inds.numel()
bg_num_rois = hard_bg_inds.numel() + easy_bg_inds.numel() bg_num_rois = hard_bg_inds.numel() + easy_bg_inds.numel()
......
...@@ -14,10 +14,21 @@ All Rights Reserved 2018. ...@@ -14,10 +14,21 @@ All Rights Reserved 2018.
extern THCState *state; extern THCState *state;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CUDA(x) do { \
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) { at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
CHECK_INPUT(new_xyz_tensor); CHECK_INPUT(new_xyz_tensor);
...@@ -26,7 +37,6 @@ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, ...@@ -26,7 +37,6 @@ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
const float *xyz = xyz_tensor.data<float>(); const float *xyz = xyz_tensor.data<float>();
int *idx = idx_tensor.data<int>(); int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx);
ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
return 1; return 1;
} }
\ No newline at end of file
...@@ -52,7 +52,7 @@ __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int ns ...@@ -52,7 +52,7 @@ __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int ns
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \
const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { const float *new_xyz, const float *xyz, int *idx) {
// new_xyz: (B, M, 3) // new_xyz: (B, M, 3)
// xyz: (B, N, 3) // xyz: (B, N, 3)
// output: // output:
...@@ -63,11 +63,11 @@ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsam ...@@ -63,11 +63,11 @@ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsam
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
ball_query_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx); ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
// cudaDeviceSynchronize(); // for using printf in kernel function // cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1); exit(-1);
} }
} }
\ No newline at end of file
...@@ -10,6 +10,6 @@ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample, ...@@ -10,6 +10,6 @@ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor); at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream); const float *xyz, const float *new_xyz, int *idx);
#endif #endif
...@@ -22,9 +22,7 @@ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample ...@@ -22,9 +22,7 @@ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
const float *grad_out = grad_out_tensor.data<float>(); const float *grad_out = grad_out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state); group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points);
group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
return 1; return 1;
} }
...@@ -36,8 +34,6 @@ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, ...@@ -36,8 +34,6 @@ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>(); float *out = out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state); group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out, stream);
return 1; return 1;
} }
\ No newline at end of file
...@@ -31,7 +31,7 @@ __global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, ...@@ -31,7 +31,7 @@ __global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints,
} }
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { const float *grad_out, const int *idx, float *grad_points) {
// grad_out: (B, C, npoints, nsample) // grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample) // idx: (B, npoints, nsample)
// output: // output:
...@@ -40,7 +40,7 @@ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, in ...@@ -40,7 +40,7 @@ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, in
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
group_points_grad_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points); group_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
...@@ -73,7 +73,7 @@ __global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int n ...@@ -73,7 +73,7 @@ __global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int n
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream) { const float *points, const int *idx, float *out) {
// points: (B, C, N) // points: (B, C, N)
// idx: (B, npoints, nsample) // idx: (B, npoints, nsample)
// output: // output:
...@@ -82,7 +82,7 @@ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsa ...@@ -82,7 +82,7 @@ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsa
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
group_points_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out); group_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
// cudaDeviceSynchronize(); // for using printf in kernel function // cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
......
...@@ -11,12 +11,12 @@ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample, ...@@ -11,12 +11,12 @@ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream); const float *points, const int *idx, float *out);
int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample, int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample, void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); const float *grad_out, const int *idx, float *grad_points);
#endif #endif
...@@ -25,8 +25,7 @@ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, ...@@ -25,8 +25,7 @@ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
float *dist2 = dist2_tensor.data<float>(); float *dist2 = dist2_tensor.data<float>();
int *idx = idx_tensor.data<int>(); int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx);
three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx, stream);
} }
...@@ -41,8 +40,7 @@ void three_interpolate_wrapper_fast(int b, int c, int m, int n, ...@@ -41,8 +40,7 @@ void three_interpolate_wrapper_fast(int b, int c, int m, int n,
float *out = out_tensor.data<float>(); float *out = out_tensor.data<float>();
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out);
three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out, stream);
} }
void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
...@@ -56,6 +54,5 @@ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, ...@@ -56,6 +54,5 @@ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
float *grad_points = grad_points_tensor.data<float>(); float *grad_points = grad_points_tensor.data<float>();
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points);
three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points, stream); }
}
\ No newline at end of file
...@@ -60,7 +60,7 @@ __global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restric ...@@ -60,7 +60,7 @@ __global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restric
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx, cudaStream_t stream) { const float *known, float *dist2, int *idx) {
// unknown: (B, N, 3) // unknown: (B, N, 3)
// known: (B, M, 3) // known: (B, M, 3)
// output: // output:
...@@ -71,7 +71,7 @@ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, ...@@ -71,7 +71,7 @@ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
three_nn_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx); three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
...@@ -104,7 +104,7 @@ __global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const ...@@ -104,7 +104,7 @@ __global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const
} }
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) { const float *points, const int *idx, const float *weight, float *out) {
// points: (B, C, M) // points: (B, C, M)
// idx: (B, N, 3) // idx: (B, N, 3)
// weight: (B, N, 3) // weight: (B, N, 3)
...@@ -114,7 +114,7 @@ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, ...@@ -114,7 +114,7 @@ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
cudaError_t err; cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
three_interpolate_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, m, n, points, idx, weight, out); three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
...@@ -149,7 +149,7 @@ __global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, c ...@@ -149,7 +149,7 @@ __global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, c
} }
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
const int *idx, const float *weight, float *grad_points, cudaStream_t stream) { const int *idx, const float *weight, float *grad_points) {
// grad_out: (B, C, N) // grad_out: (B, C, N)
// weight: (B, N, 3) // weight: (B, N, 3)
// output: // output:
...@@ -158,11 +158,11 @@ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, con ...@@ -158,11 +158,11 @@ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, con
cudaError_t err; cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
three_interpolate_grad_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points); three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1); exit(-1);
} }
} }
\ No newline at end of file
...@@ -11,20 +11,20 @@ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor, ...@@ -11,20 +11,20 @@ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor); at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown, void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx, cudaStream_t stream); const float *known, float *dist2, int *idx);
void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor, void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor); at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n, void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream); const float *points, const int *idx, const float *weight, float *out);
void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor, void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor); at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out, void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
const int *idx, const float *weight, float *grad_points, cudaStream_t stream); const int *idx, const float *weight, float *grad_points);
#endif #endif
...@@ -21,8 +21,7 @@ int gather_points_wrapper_fast(int b, int c, int n, int npoints, ...@@ -21,8 +21,7 @@ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>(); float *out = out_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state); gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out);
gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out, stream);
return 1; return 1;
} }
...@@ -34,8 +33,7 @@ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, ...@@ -34,8 +33,7 @@ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
const int *idx = idx_tensor.data<int>(); const int *idx = idx_tensor.data<int>();
float *grad_points = grad_points_tensor.data<float>(); float *grad_points = grad_points_tensor.data<float>();
cudaStream_t stream = THCState_getCurrentStream(state); gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points);
gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points, stream);
return 1; return 1;
} }
...@@ -47,7 +45,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m, ...@@ -47,7 +45,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m,
float *temp = temp_tensor.data<float>(); float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>(); int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx);
furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
return 1; return 1;
} }
...@@ -31,7 +31,7 @@ __global__ void gather_points_kernel_fast(int b, int c, int n, int m, ...@@ -31,7 +31,7 @@ __global__ void gather_points_kernel_fast(int b, int c, int n, int m,
} }
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream) { const float *points, const int *idx, float *out) {
// points: (B, C, N) // points: (B, C, N)
// idx: (B, npoints) // idx: (B, npoints)
// output: // output:
...@@ -41,7 +41,7 @@ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, ...@@ -41,7 +41,7 @@ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
gather_points_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points, idx, out); gather_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, points, idx, out);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
...@@ -70,7 +70,7 @@ __global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const ...@@ -70,7 +70,7 @@ __global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const
} }
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { const float *grad_out, const int *idx, float *grad_points) {
// grad_out: (B, C, npoints) // grad_out: (B, C, npoints)
// idx: (B, npoints) // idx: (B, npoints)
// output: // output:
...@@ -80,7 +80,7 @@ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, ...@@ -80,7 +80,7 @@ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row) dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
gather_points_grad_kernel_fast<<<blocks, threads, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points); gather_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, grad_out, idx, grad_points);
err = cudaGetLastError(); err = cudaGetLastError();
if (cudaSuccess != err) { if (cudaSuccess != err) {
...@@ -216,7 +216,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m, ...@@ -216,7 +216,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
} }
void furthest_point_sampling_kernel_launcher(int b, int n, int m, void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream) { const float *dataset, float *temp, int *idxs) {
// dataset: (B, N, 3) // dataset: (B, N, 3)
// tmp: (B, N) // tmp: (B, N)
// output: // output:
...@@ -227,29 +227,29 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m, ...@@ -227,29 +227,29 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m,
switch (n_threads) { switch (n_threads) {
case 1024: case 1024:
furthest_point_sampling_kernel<1024><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 512: case 512:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 256: case 256:
furthest_point_sampling_kernel<256><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 128: case 128:
furthest_point_sampling_kernel<128><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 64: case 64:
furthest_point_sampling_kernel<64><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 32: case 32:
furthest_point_sampling_kernel<32><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 16: case 16:
furthest_point_sampling_kernel<16><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 8: case 8:
furthest_point_sampling_kernel<8><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 4: case 4:
furthest_point_sampling_kernel<4><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 2: case 2:
furthest_point_sampling_kernel<2><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 1: case 1:
furthest_point_sampling_kernel<1><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
default: default:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
} }
err = cudaGetLastError(); err = cudaGetLastError();
......
...@@ -10,20 +10,20 @@ int gather_points_wrapper_fast(int b, int c, int n, int npoints, ...@@ -10,20 +10,20 @@ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints, void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream); const float *points, const int *idx, float *out);
int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints, int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream); const float *grad_out, const int *idx, float *grad_points);
int furthest_point_sampling_wrapper(int b, int n, int m, int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
void furthest_point_sampling_kernel_launcher(int b, int n, int m, void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream); const float *dataset, float *temp, int *idxs);
#endif #endif
...@@ -32,7 +32,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m, ...@@ -32,7 +32,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m,
float *temp = temp_tensor.data<float>(); float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>(); int *idx = idx_tensor.data<int>();
cudaStream_t stream = THCState_getCurrentStream(state); furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx);
furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
return 1; return 1;
} }
...@@ -140,7 +140,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m, ...@@ -140,7 +140,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
} }
void furthest_point_sampling_kernel_launcher(int b, int n, int m, void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream) { const float *dataset, float *temp, int *idxs) {
// dataset: (B, N, 3) // dataset: (B, N, 3)
// tmp: (B, N) // tmp: (B, N)
// output: // output:
...@@ -151,29 +151,29 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m, ...@@ -151,29 +151,29 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m,
switch (n_threads) { switch (n_threads) {
case 1024: case 1024:
furthest_point_sampling_kernel<1024><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 512: case 512:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 256: case 256:
furthest_point_sampling_kernel<256><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 128: case 128:
furthest_point_sampling_kernel<128><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 64: case 64:
furthest_point_sampling_kernel<64><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 32: case 32:
furthest_point_sampling_kernel<32><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 16: case 16:
furthest_point_sampling_kernel<16><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 8: case 8:
furthest_point_sampling_kernel<8><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 4: case 4:
furthest_point_sampling_kernel<4><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 2: case 2:
furthest_point_sampling_kernel<2><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
case 1: case 1:
furthest_point_sampling_kernel<1><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, n, m, dataset, temp, idxs); break;
default: default:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
} }
err = cudaGetLastError(); err = cudaGetLastError();
......
...@@ -10,6 +10,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m, ...@@ -10,6 +10,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
void furthest_point_sampling_kernel_launcher(int b, int n, int m, void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream); const float *dataset, float *temp, int *idxs);
#endif #endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment