Commit c3d636dc authored by Nikhila Ravi, committed by Facebook GitHub Bot

Cuda updates

Summary:
Updates to:
- enable CUDA kernel launches on any GPU, not just the default device
- add CUDA and contiguity checks to all kernels
- add checks that all input tensors are on the same device
- add error reporting after the CUDA kernel launches
- run the CUDA tests on a randomly chosen device rather than only the default one

(A sketch of the shared pattern applied across the kernels is shown after the commit metadata below.)

Reviewed By: jcjohnson, gkioxari

Differential Revision: D21215280

fbshipit-source-id: 1bedc9fe6c35e9e920bdc4d78ed12865b1005519
parent c9267ab7
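Every kernel entry point in this diff follows the same pattern: validate that the inputs share a device (and dtype), guard the current device, fetch the current stream, skip the launch for empty tensors, launch on that stream, and surface launch errors. The minimal sketch below illustrates the pattern only; `MyForwardCuda`, `MyForwardKernel`, the float-only inputs, and the 64/512 launch configuration are placeholders and not code from this commit, while the ATen helpers shown are the ones the diff itself uses.

// Illustrative sketch of the pattern applied across the kernels in this commit.
// "MyForwardCuda" / "MyForwardKernel" are placeholder names, not from the diff.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

// Placeholder kernel: writes a[i] * b[0] into out[i] (purely illustrative).
__global__ void MyForwardKernel(
    const float* __restrict__ a,
    const float* __restrict__ b,
    float* __restrict__ out,
    int64_t n) {
  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int64_t i = tid; i < n; i += gridDim.x * blockDim.x) {
    out[i] = a[i] * b[0];
  }
}

at::Tensor MyForwardCuda(const at::Tensor& a, const at::Tensor& b) {
  // 1. Check all inputs live on the same GPU and have matching dtypes.
  at::TensorArg a_t{a, "a", 1}, b_t{b, "b", 2};
  at::CheckedFrom c = "MyForwardCuda";
  at::checkAllSameGPU(c, {a_t, b_t});
  at::checkAllSameType(c, {a_t, b_t});

  // 2. Make the launch target the inputs' device (not device 0) and its stream.
  at::cuda::CUDAGuard device_guard(a.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  auto result = at::zeros_like(a);

  // 3. Skip the launch entirely for empty inputs.
  if (result.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return result;
  }

  // 4. Launch on the current stream and report any launch error.
  MyForwardKernel<<<64, 512, 0, stream>>>(
      a.data_ptr<float>(), b.data_ptr<float>(), result.data_ptr<float>(), a.numel());
  AT_CUDA_CHECK(cudaGetLastError());
  return result;
}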
@@ -2,6 +2,8 @@
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -136,6 +138,17 @@ at::Tensor alphaCompositeCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "alphaCompositeCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
@@ -143,19 +156,24 @@ at::Tensor alphaCompositeCudaForward(
   auto result = at::zeros({batch_size, C, H, W}, features.options());
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       // clang-format off
       result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
@@ -164,9 +182,26 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "alphaCompositeCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
   const int64_t bs = alphas.size(0);
   const dim3 threadsPerBlock(64);
@@ -174,7 +209,7 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       // clang-format off
       grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -183,6 +218,6 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }
@@ -2,6 +2,8 @@
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -151,6 +153,17 @@ at::Tensor weightedSumNormCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumNormCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
@@ -158,19 +171,25 @@ at::Tensor weightedSumNormCudaForward(
   auto result = at::zeros({batch_size, C, H, W}, features.options());
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
   // clang-format off
-  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
@@ -179,9 +198,26 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumNormCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
   const int64_t bs = points_idx.size(0);
   const dim3 threadsPerBlock(64);
@@ -189,7 +225,7 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       // clang-format off
       grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -198,6 +234,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }
@@ -2,6 +2,8 @@
 #include <ATen/ATen.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -110,6 +112,17 @@ at::Tensor weightedSumCudaForward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg features_t{features, "features", 1},
+      alphas_t{alphas, "alphas", 2}, points_idx_t{points_idx, "points_idx", 3};
+  at::CheckedFrom c = "weightedSumCudaForward";
+  at::checkAllSameGPU(c, {features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = points_idx.size(0);
   const int64_t C = features.size(0);
   const int64_t H = points_idx.size(2);
@@ -117,19 +130,24 @@ at::Tensor weightedSumCudaForward(
   auto result = at::zeros({batch_size, C, H, W}, features.options());
+  if (result.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return result;
+  }
   const dim3 threadsPerBlock(64);
   const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       // clang-format off
       result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return result;
 }
@@ -138,9 +156,26 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
     const at::Tensor& features,
     const at::Tensor& alphas,
     const at::Tensor& points_idx) {
+  // Check inputs are on the same device
+  at::TensorArg grad_outputs_t{grad_outputs, "grad_outputs", 1},
+      features_t{features, "features", 2}, alphas_t{alphas, "alphas", 3},
+      points_idx_t{points_idx, "points_idx", 4};
+  at::CheckedFrom c = "weightedSumCudaBackward";
+  at::checkAllSameGPU(c, {grad_outputs_t, features_t, alphas_t, points_idx_t});
+  at::checkAllSameType(c, {grad_outputs_t, features_t, alphas_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(features.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   auto grad_features = at::zeros_like(features);
   auto grad_alphas = at::zeros_like(alphas);
+  if (grad_features.numel() == 0 || grad_alphas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_features, grad_alphas);
+  }
   const int64_t bs = points_idx.size(0);
   const dim3 threadsPerBlock(64);
@@ -148,7 +183,7 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
+  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
       // clang-format off
       grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
       grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@@ -157,6 +192,6 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
       alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
       points_idx.packed_accessor64<int64_t, 4, at::RestrictPtrTraits>());
       // clang-format on
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_features, grad_alphas);
 }
@@ -23,7 +23,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 #endif
   m.def("knn_points_idx", &KNearestNeighborIdx);
   m.def("knn_points_backward", &KNearestNeighborBackward);
-  m.def("gather_scatter", &gather_scatter);
+  m.def("gather_scatter", &GatherScatter);
   m.def("rasterize_points", &RasterizePoints);
   m.def("rasterize_points_backward", &RasterizePointsBackward);
   m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
...
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <tuple>
 template <typename scalar_t>
@@ -213,14 +215,30 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForwardCuda(
   const auto V = verts.size(0);
   const auto F = faces.size(0);
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2};
+  at::CheckedFrom c = "FaceAreasNormalsForwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t});
+  at::checkAllSameType(c, {verts_t, faces_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   at::Tensor areas = at::empty({F}, verts.options());
   at::Tensor normals = at::empty({F, 3}, verts.options());
+  if (areas.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(areas, normals);
+  }
   const int blocks = 64;
   const int threads = 512;
   AT_DISPATCH_FLOATING_TYPES(
       verts.scalar_type(), "face_areas_normals_forward_cuda", ([&] {
-        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads>>>(
+        FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            verts.data_ptr<scalar_t>(),
            faces.data_ptr<int64_t>(),
            areas.data_ptr<scalar_t>(),
@@ -228,7 +246,7 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForwardCuda(
            V,
            F);
       }));
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(areas, normals);
 }
@@ -237,16 +255,33 @@ at::Tensor FaceAreasNormalsBackwardCuda(
     const at::Tensor grad_normals,
     const at::Tensor verts,
     const at::Tensor faces) {
+  // Check inputs are on the same device
+  at::TensorArg verts_t{verts, "verts", 1}, faces_t{verts, "faces", 2},
+      grad_areas_t{verts, "grad_areas", 3},
+      grad_normals_t{verts, "grad_normals", 4};
+  at::CheckedFrom c = "FaceAreasNormalsBackwardCuda";
+  at::checkAllSameGPU(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+  at::checkAllSameType(c, {verts_t, faces_t, grad_areas_t, grad_normals_t});
+
+  // Set the device for the kernel launch based on the device of verts
+  at::cuda::CUDAGuard device_guard(verts.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto V = verts.size(0);
   const auto F = faces.size(0);
   at::Tensor grad_verts = at::zeros({V, 3}, grad_areas.options());
+  if (grad_verts.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_verts;
+  }
   const int blocks = 64;
   const int threads = 512;
   // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
   // doubles. Currently, support is for floats only.
-  FaceAreasNormalsBackwardKernel<<<blocks, threads>>>(
+  FaceAreasNormalsBackwardKernel<<<blocks, threads, 0, stream>>>(
       grad_areas.data_ptr<float>(),
       grad_normals.data_ptr<float>(),
       verts.data_ptr<float>(),
@@ -255,5 +290,6 @@ at::Tensor FaceAreasNormalsBackwardCuda(
       V,
       F);
+  AT_CUDA_CHECK(cudaGetLastError());
   return grad_verts;
 }
@@ -3,6 +3,7 @@
 #pragma once
 #include <torch/extension.h>
 #include <tuple>
+#include "utils/pytorch3d_cutils.h"
 // Compute areas of mesh faces using packed representation.
 //
@@ -46,6 +47,8 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForward(
     const at::Tensor faces) {
   if (verts.is_cuda() && faces.is_cuda()) {
 #ifdef WITH_CUDA
+    CHECK_CONTIGUOUS_CUDA(verts);
+    CHECK_CONTIGUOUS_CUDA(faces);
     return FaceAreasNormalsForwardCuda(verts, faces);
 #else
     AT_ERROR("Not compiled with GPU support.");
@@ -62,6 +65,10 @@ at::Tensor FaceAreasNormalsBackward(
     const at::Tensor faces) {
   if (verts.is_cuda() && faces.is_cuda()) {
 #ifdef WITH_CUDA
+    CHECK_CONTIGUOUS_CUDA(verts);
+    CHECK_CONTIGUOUS_CUDA(faces);
+    CHECK_CONTIGUOUS_CUDA(grad_areas);
+    CHECK_CONTIGUOUS_CUDA(grad_normals);
     return FaceAreasNormalsBackwardCuda(grad_areas, grad_normals, verts, faces);
 #else
     AT_ERROR("Not compiled with GPU support.");
...
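For reference, the CHECK_CONTIGUOUS_CUDA macro added to the host-side entry points above comes from utils/pytorch3d_cutils.h; its body is not part of this diff, but it presumably expands to roughly the following (a sketch from memory, not the verbatim header, and the exact error-message wording may differ):

// Presumed shape of the checks in utils/pytorch3d_cutils.h (illustrative only).
#pragma once
#include <torch/extension.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor.")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous.")
#define CHECK_CONTIGUOUS_CUDA(x) \
  CHECK_CUDA(x);                 \
  CHECK_CONTIGUOUS(x)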
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 // TODO(T47953967) to make this cuda kernel support all datatypes.
-__global__ void gather_scatter_kernel(
+__global__ void GatherScatterCudaKernel(
     const float* __restrict__ input,
     const int64_t* __restrict__ edges,
     float* __restrict__ output,
@@ -41,11 +43,20 @@ __global__ void gather_scatter_kernel(
   }
 }
-at::Tensor gather_scatter_cuda(
+at::Tensor GatherScatterCuda(
     const at::Tensor input,
     const at::Tensor edges,
     bool directed,
     bool backward) {
+  // Check inputs are on the same device
+  at::TensorArg input_t{input, "input", 1}, edges_t{edges, "edges", 2};
+  at::CheckedFrom c = "GatherScatterCuda";
+  at::checkAllSameGPU(c, {input_t, edges_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(input.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto num_vertices = input.size(0);
   const auto input_feature_dim = input.size(1);
   const auto num_edges = edges.size(0);
@@ -55,7 +66,12 @@ at::Tensor gather_scatter_cuda(
   const size_t max_blocks = 1920;
   const size_t blocks = num_edges < max_blocks ? num_edges : max_blocks;
-  gather_scatter_kernel<<<blocks, threads>>>(
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  GatherScatterCudaKernel<<<blocks, threads, 0, stream>>>(
       input.data_ptr<float>(),
       edges.data_ptr<int64_t>(),
       output.data_ptr<float>(),
@@ -64,6 +80,6 @@ at::Tensor gather_scatter_cuda(
       num_vertices,
       input_feature_dim,
       num_edges);
+  AT_CUDA_CHECK(cudaGetLastError());
   return output;
 }
@@ -2,6 +2,7 @@
 #pragma once
 #include <torch/extension.h>
+#include "utils/pytorch3d_cutils.h"
 // Fused gather scatter operation for aggregating features of neighbor nodes
 // in a graph. This gather scatter operation is specific to graphs as edge
@@ -20,21 +21,23 @@
 // output: float32 Tensor of same shape as input.
 // Cuda implementation.
-at::Tensor gather_scatter_cuda(
+at::Tensor GatherScatterCuda(
     const at::Tensor input,
     const at::Tensor edges,
     bool directed,
     bool backward);
 // Exposed implementation.
-at::Tensor gather_scatter(
+at::Tensor GatherScatter(
     const at::Tensor input,
     const at::Tensor edges,
     bool directed,
     bool backward) {
   if (input.is_cuda() && edges.is_cuda()) {
 #ifdef WITH_CUDA
-    return gather_scatter_cuda(input, edges, directed, backward);
+    CHECK_CONTIGUOUS_CUDA(input);
+    CHECK_CONTIGUOUS_CUDA(edges);
+    return GatherScatterCuda(input, edges, directed, backward);
 #else
     AT_ERROR("Not compiled with GPU support.");
 #endif
...
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <float.h>
 #include <iostream>
 #include <tuple>
@@ -114,7 +116,8 @@ struct KNearestNeighborV1Functor {
       const size_t P1,
       const size_t P2,
       const size_t K) {
-    KNearestNeighborKernelV1<scalar_t, D><<<blocks, threads>>>(
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    KNearestNeighborKernelV1<scalar_t, D><<<blocks, threads, 0, stream>>>(
         points1, points2, lengths1, lengths2, dists, idxs, N, P1, P2, K);
   }
 };
@@ -178,7 +181,8 @@ struct KNearestNeighborKernelV2Functor {
      const int64_t N,
      const int64_t P1,
      const int64_t P2) {
-    KNearestNeighborKernelV2<scalar_t, D, K><<<blocks, threads>>>(
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    KNearestNeighborKernelV2<scalar_t, D, K><<<blocks, threads, 0, stream>>>(
        points1, points2, lengths1, lengths2, dists, idxs, N, P1, P2);
   }
 };
@@ -245,7 +249,8 @@ struct KNearestNeighborKernelV3Functor {
      const size_t N,
      const size_t P1,
      const size_t P2) {
-    KNearestNeighborKernelV3<scalar_t, D, K><<<blocks, threads>>>(
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    KNearestNeighborKernelV3<scalar_t, D, K><<<blocks, threads, 0, stream>>>(
        points1, points2, lengths1, lengths2, dists, idxs, N, P1, P2);
   }
 };
@@ -296,17 +301,33 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
     const at::Tensor& lengths2,
     int K,
     int version) {
+  // Check inputs are on the same device
+  at::TensorArg p1_t{p1, "p1", 1}, p2_t{p2, "p2", 2},
+      lengths1_t{lengths1, "lengths1", 3}, lengths2_t{lengths2, "lengths2", 4};
+  at::CheckedFrom c = "KNearestNeighborIdxCuda";
+  at::checkAllSameGPU(c, {p1_t, p2_t, lengths1_t, lengths2_t});
+  at::checkAllSameType(c, {p1_t, p2_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(p1.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto N = p1.size(0);
   const auto P1 = p1.size(1);
   const auto P2 = p2.size(1);
   const auto D = p2.size(2);
   const int64_t K_64 = K;
-  AT_ASSERTM(p2.size(2) == D, "Point sets must have the same last dimension");
+  TORCH_CHECK(p2.size(2) == D, "Point sets must have the same last dimension");
   auto long_dtype = p1.options().dtype(at::kLong);
   auto idxs = at::zeros({N, P1, K}, long_dtype);
   auto dists = at::zeros({N, P1, K}, p1.options());
+  if (idxs.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(idxs, dists);
+  }
   if (version < 0) {
     version = ChooseVersion(D, K);
   } else if (!KnnCheckVersion(version, D, K)) {
@@ -328,7 +349,7 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
   if (version == 0) {
     AT_DISPATCH_FLOATING_TYPES(p1.scalar_type(), "knn_kernel_cuda", ([&] {
       KNearestNeighborKernelV0<scalar_t>
-          <<<blocks, threads>>>(
+          <<<blocks, threads, 0, stream>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
              lengths1.data_ptr<int64_t>(),
@@ -409,7 +430,7 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
              P2);
     }));
   }
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(idxs, dists);
 }
@@ -465,27 +486,45 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackwardCuda(
     const at::Tensor& lengths2,
     const at::Tensor& idxs,
     const at::Tensor& grad_dists) {
+  // Check inputs are on the same device
+  at::TensorArg p1_t{p1, "p1", 1}, p2_t{p2, "p2", 2},
+      lengths1_t{lengths1, "lengths1", 3}, lengths2_t{lengths2, "lengths2", 4},
+      idxs_t{idxs, "idxs", 5}, grad_dists_t{grad_dists, "grad_dists", 6};
+  at::CheckedFrom c = "KNearestNeighborBackwardCuda";
+  at::checkAllSameGPU(
+      c, {p1_t, p2_t, lengths1_t, lengths2_t, idxs_t, grad_dists_t});
+  at::checkAllSameType(c, {p1_t, p2_t, grad_dists_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(p1.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const auto N = p1.size(0);
   const auto P1 = p1.size(1);
   const auto P2 = p2.size(1);
   const auto D = p2.size(2);
   const auto K = idxs.size(2);
-  AT_ASSERTM(p2.size(2) == D, "Point sets must have the same last dimension");
+  TORCH_CHECK(p2.size(2) == D, "Point sets must have the same last dimension");
-  AT_ASSERTM(idxs.size(0) == N, "KNN idxs must have the same batch dimension");
+  TORCH_CHECK(idxs.size(0) == N, "KNN idxs must have the same batch dimension");
-  AT_ASSERTM(
+  TORCH_CHECK(
       idxs.size(1) == P1, "KNN idxs must have the same point dimension as p1");
-  AT_ASSERTM(grad_dists.size(0) == N);
+  TORCH_CHECK(grad_dists.size(0) == N);
-  AT_ASSERTM(grad_dists.size(1) == P1);
+  TORCH_CHECK(grad_dists.size(1) == P1);
-  AT_ASSERTM(grad_dists.size(2) == K);
+  TORCH_CHECK(grad_dists.size(2) == K);
   auto grad_p1 = at::zeros({N, P1, D}, p1.options());
   auto grad_p2 = at::zeros({N, P2, D}, p2.options());
+  if (grad_p1.numel() == 0 || grad_p2.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return std::make_tuple(grad_p1, grad_p2);
+  }
   const int blocks = 64;
   const int threads = 512;
-  KNearestNeighborBackwardKernel<<<blocks, threads>>>(
+  KNearestNeighborBackwardKernel<<<blocks, threads, 0, stream>>>(
       p1.data_ptr<float>(),
       p2.data_ptr<float>(),
       lengths1.data_ptr<int64_t>(),
@@ -500,5 +539,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackwardCuda(
       K,
       D);
+  AT_CUDA_CHECK(cudaGetLastError());
   return std::make_tuple(grad_p1, grad_p2);
 }
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 // Kernel for inputs_packed of shape (F, D), where D > 1
 template <typename scalar_t>
@@ -114,21 +116,36 @@ at::Tensor PackedToPaddedCuda(
     const at::Tensor inputs_packed,
     const at::Tensor first_idxs,
     const int64_t max_size) {
+  // Check inputs are on the same device
+  at::TensorArg inputs_packed_t{inputs_packed, "inputs_packed", 1},
+      first_idxs_t{first_idxs, "first_idxs", 2};
+  at::CheckedFrom c = "PackedToPaddedCuda";
+  at::checkAllSameGPU(c, {inputs_packed_t, first_idxs_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(inputs_packed.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t num_inputs = inputs_packed.size(0);
   const int64_t batch_size = first_idxs.size(0);
-  AT_ASSERTM(
+  TORCH_CHECK(
      inputs_packed.dim() == 2, "inputs_packed must be a 2-dimensional tensor");
   const int64_t D = inputs_packed.size(1);
   at::Tensor inputs_padded =
      at::zeros({batch_size, max_size, D}, inputs_packed.options());
+  if (inputs_padded.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return inputs_padded;
+  }
   const int threads = 512;
   const int blocks = batch_size;
   if (D == 1) {
     AT_DISPATCH_FLOATING_TYPES(
        inputs_packed.scalar_type(), "packed_to_padded_d1_kernel", ([&] {
-          PackedToPaddedKernelD1<scalar_t><<<blocks, threads>>>(
+          PackedToPaddedKernelD1<scalar_t><<<blocks, threads, 0, stream>>>(
             inputs_packed.data_ptr<scalar_t>(),
             first_idxs.data_ptr<int64_t>(),
             inputs_padded.data_ptr<scalar_t>(),
@@ -139,7 +156,7 @@ at::Tensor PackedToPaddedCuda(
   } else {
     AT_DISPATCH_FLOATING_TYPES(
        inputs_packed.scalar_type(), "packed_to_padded_kernel", ([&] {
-          PackedToPaddedKernel<scalar_t><<<blocks, threads>>>(
+          PackedToPaddedKernel<scalar_t><<<blocks, threads, 0, stream>>>(
             inputs_packed.data_ptr<scalar_t>(),
             first_idxs.data_ptr<int64_t>(),
             inputs_padded.data_ptr<scalar_t>(),
@@ -150,6 +167,7 @@ at::Tensor PackedToPaddedCuda(
        }));
   }
+  AT_CUDA_CHECK(cudaGetLastError());
   return inputs_padded;
 }
@@ -157,11 +175,21 @@ at::Tensor PaddedToPackedCuda(
     const at::Tensor inputs_padded,
     const at::Tensor first_idxs,
     const int64_t num_inputs) {
+  // Check inputs are on the same device
+  at::TensorArg inputs_padded_t{inputs_padded, "inputs_padded", 1},
+      first_idxs_t{first_idxs, "first_idxs", 2};
+  at::CheckedFrom c = "PaddedToPackedCuda";
+  at::checkAllSameGPU(c, {inputs_padded_t, first_idxs_t});
+
+  // Set the device for the kernel launch based on the device of the input
+  at::cuda::CUDAGuard device_guard(inputs_padded.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
   const int64_t batch_size = inputs_padded.size(0);
   const int64_t max_size = inputs_padded.size(1);
-  AT_ASSERTM(batch_size == first_idxs.size(0), "sizes mismatch");
+  TORCH_CHECK(batch_size == first_idxs.size(0), "sizes mismatch");
-  AT_ASSERTM(
+  TORCH_CHECK(
      inputs_padded.dim() == 3,
      "inputs_padded must be a 3-dimensional tensor");
   const int64_t D = inputs_padded.size(2);
@@ -169,13 +197,18 @@ at::Tensor PaddedToPackedCuda(
   at::Tensor inputs_packed =
      at::zeros({num_inputs, D}, inputs_padded.options());
+  if (inputs_packed.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return inputs_packed;
+  }
   const int threads = 512;
   const int blocks = batch_size;
   if (D == 1) {
     AT_DISPATCH_FLOATING_TYPES(
        inputs_padded.scalar_type(), "padded_to_packed_d1_kernel", ([&] {
-          PaddedToPackedKernelD1<scalar_t><<<blocks, threads>>>(
+          PaddedToPackedKernelD1<scalar_t><<<blocks, threads, 0, stream>>>(
             inputs_padded.data_ptr<scalar_t>(),
             first_idxs.data_ptr<int64_t>(),
             inputs_packed.data_ptr<scalar_t>(),
@@ -186,7 +219,7 @@ at::Tensor PaddedToPackedCuda(
   } else {
     AT_DISPATCH_FLOATING_TYPES(
        inputs_padded.scalar_type(), "padded_to_packed_kernel", ([&] {
-          PaddedToPackedKernel<scalar_t><<<blocks, threads>>>(
+          PaddedToPackedKernel<scalar_t><<<blocks, threads, 0, stream>>>(
             inputs_padded.data_ptr<scalar_t>(),
             first_idxs.data_ptr<int64_t>(),
             inputs_packed.data_ptr<scalar_t>(),
@@ -197,5 +230,6 @@ at::Tensor PaddedToPackedCuda(
        }));
   }
+  AT_CUDA_CHECK(cudaGetLastError());
   return inputs_packed;
 }
@@ -2,6 +2,7 @@
 #pragma once
 #include <torch/extension.h>
+#include "utils/pytorch3d_cutils.h"
 // PackedToPadded
 // Converts a packed tensor into a padded tensor, restoring the batch dimension.
@@ -74,6 +75,8 @@ at::Tensor PackedToPadded(
     const int64_t max_size) {
   if (inputs_packed.is_cuda()) {
 #ifdef WITH_CUDA
+    CHECK_CONTIGUOUS_CUDA(inputs_packed);
+    CHECK_CONTIGUOUS_CUDA(first_idxs);
     return PackedToPaddedCuda(inputs_packed, first_idxs, max_size);
 #else
     AT_ERROR("Not compiled with GPU support.");
@@ -89,6 +92,8 @@ at::Tensor PaddedToPacked(
     const int64_t num_inputs) {
   if (inputs_padded.is_cuda()) {
 #ifdef WITH_CUDA
+    CHECK_CONTIGUOUS_CUDA(inputs_padded);
+    CHECK_CONTIGUOUS_CUDA(first_idxs);
     return PaddedToPackedCuda(inputs_padded, first_idxs, num_inputs);
 #else
     AT_ERROR("Not compiled with GPU support.");
...
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm> #include <algorithm>
#include <list> #include <list>
#include <queue> #include <queue>
...@@ -103,26 +105,45 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda( ...@@ -103,26 +105,45 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda(
const at::Tensor& segms, const at::Tensor& segms,
const at::Tensor& segms_first_idx, const at::Tensor& segms_first_idx,
const int64_t max_points) { const int64_t max_points) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
points_first_idx_t{points_first_idx, "points_first_idx", 2},
segms_t{segms, "segms", 3},
segms_first_idx_t{segms_first_idx, "segms_first_idx", 4};
at::CheckedFrom c = "PointEdgeDistanceForwardCuda";
at::checkAllSameGPU(
c, {points_t, points_first_idx_t, segms_t, segms_first_idx_t});
at::checkAllSameType(c, {points_t, segms_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
const int64_t B = points_first_idx.size(0); const int64_t B = points_first_idx.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
AT_ASSERTM(segms_first_idx.size(0) == B); TORCH_CHECK(segms_first_idx.size(0) == B);
// clang-format off // clang-format off
at::Tensor dists = at::zeros({P,}, points.options()); at::Tensor dists = at::zeros({P,}, points.options());
at::Tensor idxs = at::zeros({P,}, points_first_idx.options()); at::Tensor idxs = at::zeros({P,}, points_first_idx.options());
// clang-format on // clang-format on
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs);
}
const int threads = 128; const int threads = 128;
const dim3 blocks(max_points, B); const dim3 blocks(max_points, B);
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t); size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
PointEdgeForwardKernel<<<blocks, threads, shared_size>>>( PointEdgeForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(), points_first_idx.data_ptr<int64_t>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
...@@ -132,7 +153,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda( ...@@ -132,7 +153,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda(
B, B,
P, P,
S); S);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs); return std::make_tuple(dists, idxs);
} }
...@@ -183,25 +204,42 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda( ...@@ -183,25 +204,42 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda(
const at::Tensor& segms, const at::Tensor& segms,
const at::Tensor& idx_points, const at::Tensor& idx_points,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
idx_points_t{idx_points, "idx_points", 2}, segms_t{segms, "segms", 3},
grad_dists_t{grad_dists, "grad_dists", 4};
at::CheckedFrom c = "PointEdgeDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, idx_points_t, segms_t, grad_dists_t});
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
AT_ASSERTM(idx_points.size(0) == P); TORCH_CHECK(idx_points.size(0) == P);
AT_ASSERTM(grad_dists.size(0) == P); TORCH_CHECK(grad_dists.size(0) == P);
// clang-format off // clang-format off
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options()); at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options());
// clang-format on // clang-format on
if (grad_points.numel() == 0 || grad_segms.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_segms);
}
const int blocks = 64; const int blocks = 64;
const int threads = 512; const int threads = 512;
PointEdgeBackwardKernel<<<blocks, threads>>>( PointEdgeBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
idx_points.data_ptr<int64_t>(), idx_points.data_ptr<int64_t>(),
...@@ -210,6 +248,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda( ...@@ -210,6 +248,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda(
grad_segms.data_ptr<float>(), grad_segms.data_ptr<float>(),
P); P);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_segms); return std::make_tuple(grad_points, grad_segms);
} }
...@@ -308,26 +347,45 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda( ...@@ -308,26 +347,45 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda(
const at::Tensor& segms, const at::Tensor& segms,
const at::Tensor& segms_first_idx, const at::Tensor& segms_first_idx,
const int64_t max_segms) { const int64_t max_segms) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
points_first_idx_t{points_first_idx, "points_first_idx", 2},
segms_t{segms, "segms", 3},
segms_first_idx_t{segms_first_idx, "segms_first_idx", 4};
at::CheckedFrom c = "EdgePointDistanceForwardCuda";
at::checkAllSameGPU(
c, {points_t, points_first_idx_t, segms_t, segms_first_idx_t});
at::checkAllSameType(c, {points_t, segms_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
const int64_t B = points_first_idx.size(0); const int64_t B = points_first_idx.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
AT_ASSERTM(segms_first_idx.size(0) == B); TORCH_CHECK(segms_first_idx.size(0) == B);
// clang-format off // clang-format off
at::Tensor dists = at::zeros({S,}, segms.options()); at::Tensor dists = at::zeros({S,}, segms.options());
at::Tensor idxs = at::zeros({S,}, segms_first_idx.options()); at::Tensor idxs = at::zeros({S,}, segms_first_idx.options());
// clang-format on // clang-format on
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs);
}
const int threads = 128; const int threads = 128;
const dim3 blocks(max_segms, B); const dim3 blocks(max_segms, B);
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t); size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
EdgePointForwardKernel<<<blocks, threads, shared_size>>>( EdgePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(), points_first_idx.data_ptr<int64_t>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
...@@ -337,7 +395,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda( ...@@ -337,7 +395,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda(
B, B,
P, P,
S); S);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs); return std::make_tuple(dists, idxs);
} }
...@@ -389,15 +447,27 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda( ...@@ -389,15 +447,27 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda(
const at::Tensor& segms, const at::Tensor& segms,
const at::Tensor& idx_segms, const at::Tensor& idx_segms,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
idx_segms_t{idx_segms, "idx_segms", 2}, segms_t{segms, "segms", 3},
grad_dists_t{grad_dists, "grad_dists", 4};
at::CheckedFrom c = "PointEdgeDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, idx_segms_t, segms_t, grad_dists_t});
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
AT_ASSERTM(idx_segms.size(0) == S); TORCH_CHECK(idx_segms.size(0) == S);
AT_ASSERTM(grad_dists.size(0) == S); TORCH_CHECK(grad_dists.size(0) == S);
// clang-format off // clang-format off
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
...@@ -407,7 +477,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda( ...@@ -407,7 +477,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda(
const int blocks = 64; const int blocks = 64;
const int threads = 512; const int threads = 512;
EdgePointBackwardKernel<<<blocks, threads>>>( EdgePointBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
idx_segms.data_ptr<int64_t>(), idx_segms.data_ptr<int64_t>(),
...@@ -451,26 +521,42 @@ __global__ void PointEdgeArrayForwardKernel( ...@@ -451,26 +521,42 @@ __global__ void PointEdgeArrayForwardKernel(
at::Tensor PointEdgeArrayDistanceForwardCuda( at::Tensor PointEdgeArrayDistanceForwardCuda(
const at::Tensor& points, const at::Tensor& points,
const at::Tensor& segms) { const at::Tensor& segms) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1}, segms_t{segms, "segms", 2};
at::CheckedFrom c = "PointEdgeArrayDistanceForwardCuda";
at::checkAllSameGPU(c, {points_t, segms_t});
at::checkAllSameType(c, {points_t, segms_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
at::Tensor dists = at::zeros({P, S}, points.options()); at::Tensor dists = at::zeros({P, S}, points.options());
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return dists;
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
PointEdgeArrayForwardKernel<<<blocks, threads>>>( PointEdgeArrayForwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
dists.data_ptr<float>(), dists.data_ptr<float>(),
P, P,
S); S);
AT_CUDA_CHECK(cudaGetLastError());
return dists; return dists;
} }
...@@ -520,22 +606,38 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda( ...@@ -520,22 +606,38 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda(
const at::Tensor& points, const at::Tensor& points,
const at::Tensor& segms, const at::Tensor& segms,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1}, segms_t{segms, "segms", 2},
grad_dists_t{grad_dists, "grad_dists", 3};
at::CheckedFrom c = "PointEdgeArrayDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, segms_t, grad_dists_t});
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t S = segms.size(0); const int64_t S = segms.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(segms.size(1) == 2) && (segms.size(2) == 3), (segms.size(1) == 2) && (segms.size(2) == 3),
"segms must be of shape Sx2x3"); "segms must be of shape Sx2x3");
AT_ASSERTM((grad_dists.size(0) == P) && (grad_dists.size(1) == S)); TORCH_CHECK((grad_dists.size(0) == P) && (grad_dists.size(1) == S));
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options()); at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options());
if (grad_points.numel() == 0 || grad_segms.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_segms);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
PointEdgeArrayBackwardKernel<<<blocks, threads>>>( PointEdgeArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
segms.data_ptr<float>(), segms.data_ptr<float>(),
grad_dists.data_ptr<float>(), grad_dists.data_ptr<float>(),
...@@ -543,6 +645,6 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda( ...@@ -543,6 +645,6 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda(
grad_segms.data_ptr<float>(), grad_segms.data_ptr<float>(),
P, P,
S); S);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_segms); return std::make_tuple(grad_points, grad_segms);
} }
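
Every CUDA entry point touched in this diff now follows the same host-side skeleton: wrap each argument in an at::TensorArg so checkAllSameGPU / checkAllSameType can name the offending tensor, pin the launch device with c10::cuda::CUDAGuard, fetch the current stream, return early when the output is empty, launch on that stream, and surface failures through AT_CUDA_CHECK(cudaGetLastError()). A minimal sketch of that skeleton follows; myOpCudaForward and MyCopyKernel are hypothetical stand-ins (not functions from this diff), while the ATen calls are the same ones used above.

// Minimal sketch, assuming a float input and a trivial 1-D copy as the "op".
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_runtime.h>

__global__ void MyCopyKernel(const float* in, float* out, int64_t n) {
  const int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i];
  }
}

at::Tensor myOpCudaForward(const at::Tensor& in) {
  // Named TensorArgs let checkAllSameGPU report which argument is misplaced.
  at::TensorArg in_t{in, "in", 1};
  at::CheckedFrom c = "myOpCudaForward";
  at::checkAllSameGPU(c, {in_t});

  // Launch on whichever GPU holds the input, not just the default device.
  at::cuda::CUDAGuard device_guard(in.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  at::Tensor in_c = in.contiguous();
  at::Tensor out = at::zeros_like(in_c);
  if (out.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return out; // skip the launch entirely for empty tensors
  }

  const int threads = 64;
  const int blocks = (int)((in_c.numel() + threads - 1) / threads);
  MyCopyKernel<<<blocks, threads, 0, stream>>>(
      in_c.data_ptr<float>(), out.data_ptr<float>(), in_c.numel());

  // Report launch errors instead of failing silently.
  AT_CUDA_CHECK(cudaGetLastError());
  return out;
}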
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <cstdio> #include <cstdio>
#include <tuple> #include <tuple>
#include "utils/pytorch3d_cutils.h"
// **************************************************************************** // ****************************************************************************
// * PointEdgeDistance * // * PointEdgeDistance *
...@@ -53,6 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward( ...@@ -53,6 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
const int64_t max_points) { const int64_t max_points) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
return PointEdgeDistanceForwardCuda( return PointEdgeDistanceForwardCuda(
points, points_first_idx, segms, segms_first_idx, max_points); points, points_first_idx, segms, segms_first_idx, max_points);
#else #else
...@@ -93,6 +98,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward( ...@@ -93,6 +98,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(idx_points);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return PointEdgeDistanceBackwardCuda(points, segms, idx_points, grad_dists); return PointEdgeDistanceBackwardCuda(points, segms, idx_points, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -149,6 +158,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward( ...@@ -149,6 +158,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
const int64_t max_segms) { const int64_t max_segms) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
return EdgePointDistanceForwardCuda( return EdgePointDistanceForwardCuda(
points, points_first_idx, segms, segms_first_idx, max_segms); points, points_first_idx, segms, segms_first_idx, max_segms);
#else #else
...@@ -189,6 +202,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward( ...@@ -189,6 +202,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(idx_segms);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return EdgePointDistanceBackwardCuda(points, segms, idx_segms, grad_dists); return EdgePointDistanceBackwardCuda(points, segms, idx_segms, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -220,7 +237,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward( ...@@ -220,7 +237,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
// will require for the forward pass 5.8G of memory to store dists. // will require for the forward pass 5.8G of memory to store dists.
#ifdef WITH_CUDA #ifdef WITH_CUDA
torch::Tensor PointEdgeArrayDistanceForwardCuda( torch::Tensor PointEdgeArrayDistanceForwardCuda(
const torch::Tensor& points, const torch::Tensor& points,
const torch::Tensor& segms); const torch::Tensor& segms);
...@@ -231,6 +247,8 @@ torch::Tensor PointEdgeArrayDistanceForward( ...@@ -231,6 +247,8 @@ torch::Tensor PointEdgeArrayDistanceForward(
const torch::Tensor& segms) { const torch::Tensor& segms) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
return PointEdgeArrayDistanceForwardCuda(points, segms); return PointEdgeArrayDistanceForwardCuda(points, segms);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -265,6 +283,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward( ...@@ -265,6 +283,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return PointEdgeArrayDistanceBackwardCuda(points, segms, grad_dists); return PointEdgeArrayDistanceBackwardCuda(points, segms, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
......
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm> #include <algorithm>
#include <list> #include <list>
#include <queue> #include <queue>
...@@ -104,26 +106,45 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda( ...@@ -104,26 +106,45 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda(
const at::Tensor& tris, const at::Tensor& tris,
const at::Tensor& tris_first_idx, const at::Tensor& tris_first_idx,
const int64_t max_points) { const int64_t max_points) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
points_first_idx_t{points_first_idx, "points_first_idx", 2},
tris_t{tris, "tris", 3},
tris_first_idx_t{tris_first_idx, "tris_first_idx", 4};
at::CheckedFrom c = "PointFaceDistanceForwardCuda";
at::checkAllSameGPU(
c, {points_t, points_first_idx_t, tris_t, tris_first_idx_t});
at::checkAllSameType(c, {points_t, tris_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
const int64_t B = points_first_idx.size(0); const int64_t B = points_first_idx.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
AT_ASSERTM(tris_first_idx.size(0) == B); TORCH_CHECK(tris_first_idx.size(0) == B);
// clang-format off // clang-format off
at::Tensor dists = at::zeros({P,}, points.options()); at::Tensor dists = at::zeros({P,}, points.options());
at::Tensor idxs = at::zeros({P,}, points_first_idx.options()); at::Tensor idxs = at::zeros({P,}, points_first_idx.options());
// clang-format on // clang-format on
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs);
}
const int threads = 128; const int threads = 128;
const dim3 blocks(max_points, B); const dim3 blocks(max_points, B);
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t); size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
PointFaceForwardKernel<<<blocks, threads, shared_size>>>( PointFaceForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(), points_first_idx.data_ptr<int64_t>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
...@@ -134,6 +155,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda( ...@@ -134,6 +155,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda(
P, P,
T); T);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs); return std::make_tuple(dists, idxs);
} }
...@@ -191,25 +213,42 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda( ...@@ -191,25 +213,42 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda(
const at::Tensor& tris, const at::Tensor& tris,
const at::Tensor& idx_points, const at::Tensor& idx_points,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
idx_points_t{idx_points, "idx_points", 2}, tris_t{tris, "tris", 3},
grad_dists_t{grad_dists, "grad_dists", 4};
at::CheckedFrom c = "PointFaceDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, idx_points_t, tris_t, grad_dists_t});
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
AT_ASSERTM(idx_points.size(0) == P); TORCH_CHECK(idx_points.size(0) == P);
AT_ASSERTM(grad_dists.size(0) == P); TORCH_CHECK(grad_dists.size(0) == P);
// clang-format off // clang-format off
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options()); at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
// clang-format on // clang-format on
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris);
}
const int blocks = 64; const int blocks = 64;
const int threads = 512; const int threads = 512;
PointFaceBackwardKernel<<<blocks, threads>>>( PointFaceBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
idx_points.data_ptr<int64_t>(), idx_points.data_ptr<int64_t>(),
...@@ -218,6 +257,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda( ...@@ -218,6 +257,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda(
grad_tris.data_ptr<float>(), grad_tris.data_ptr<float>(),
P); P);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris); return std::make_tuple(grad_points, grad_tris);
} }
...@@ -317,26 +357,45 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda( ...@@ -317,26 +357,45 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda(
const at::Tensor& tris, const at::Tensor& tris,
const at::Tensor& tris_first_idx, const at::Tensor& tris_first_idx,
const int64_t max_tris) { const int64_t max_tris) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
points_first_idx_t{points_first_idx, "points_first_idx", 2},
tris_t{tris, "tris", 3},
tris_first_idx_t{tris_first_idx, "tris_first_idx", 4};
at::CheckedFrom c = "FacePointDistanceForwardCuda";
at::checkAllSameGPU(
c, {points_t, points_first_idx_t, tris_t, tris_first_idx_t});
at::checkAllSameType(c, {points_t, tris_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
const int64_t B = points_first_idx.size(0); const int64_t B = points_first_idx.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
AT_ASSERTM(tris_first_idx.size(0) == B); TORCH_CHECK(tris_first_idx.size(0) == B);
// clang-format off // clang-format off
at::Tensor dists = at::zeros({T,}, tris.options()); at::Tensor dists = at::zeros({T,}, tris.options());
at::Tensor idxs = at::zeros({T,}, tris_first_idx.options()); at::Tensor idxs = at::zeros({T,}, tris_first_idx.options());
// clang-format on // clang-format on
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs);
}
const int threads = 128; const int threads = 128;
const dim3 blocks(max_tris, B); const dim3 blocks(max_tris, B);
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t); size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
FacePointForwardKernel<<<blocks, threads, shared_size>>>( FacePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(), points_first_idx.data_ptr<int64_t>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
...@@ -347,6 +406,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda( ...@@ -347,6 +406,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda(
P, P,
T); T);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs); return std::make_tuple(dists, idxs);
} }
...@@ -405,25 +465,42 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda( ...@@ -405,25 +465,42 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda(
const at::Tensor& tris, const at::Tensor& tris,
const at::Tensor& idx_tris, const at::Tensor& idx_tris,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
idx_tris_t{idx_tris, "idx_tris", 2}, tris_t{tris, "tris", 3},
grad_dists_t{grad_dists, "grad_dists", 4};
at::CheckedFrom c = "FacePointDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, idx_tris_t, tris_t, grad_dists_t});
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
AT_ASSERTM(idx_tris.size(0) == T); TORCH_CHECK(idx_tris.size(0) == T);
AT_ASSERTM(grad_dists.size(0) == T); TORCH_CHECK(grad_dists.size(0) == T);
// clang-format off // clang-format off
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options()); at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
// clang-format on // clang-format on
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris);
}
const int blocks = 64; const int blocks = 64;
const int threads = 512; const int threads = 512;
FacePointBackwardKernel<<<blocks, threads>>>( FacePointBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
idx_tris.data_ptr<int64_t>(), idx_tris.data_ptr<int64_t>(),
...@@ -432,6 +509,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda( ...@@ -432,6 +509,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda(
grad_tris.data_ptr<float>(), grad_tris.data_ptr<float>(),
T); T);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris); return std::make_tuple(grad_points, grad_tris);
} }
...@@ -468,26 +546,42 @@ __global__ void PointFaceArrayForwardKernel( ...@@ -468,26 +546,42 @@ __global__ void PointFaceArrayForwardKernel(
at::Tensor PointFaceArrayDistanceForwardCuda( at::Tensor PointFaceArrayDistanceForwardCuda(
const at::Tensor& points, const at::Tensor& points,
const at::Tensor& tris) { const at::Tensor& tris) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1}, tris_t{tris, "tris", 2};
at::CheckedFrom c = "PointFaceArrayDistanceForwardCuda";
at::checkAllSameGPU(c, {points_t, tris_t});
at::checkAllSameType(c, {points_t, tris_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
at::Tensor dists = at::zeros({P, T}, points.options()); at::Tensor dists = at::zeros({P, T}, points.options());
if (dists.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return dists;
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
PointFaceArrayForwardKernel<<<blocks, threads>>>( PointFaceArrayForwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
dists.data_ptr<float>(), dists.data_ptr<float>(),
P, P,
T); T);
AT_CUDA_CHECK(cudaGetLastError());
return dists; return dists;
} }
...@@ -546,22 +640,38 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda( ...@@ -546,22 +640,38 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda(
const at::Tensor& points, const at::Tensor& points,
const at::Tensor& tris, const at::Tensor& tris,
const at::Tensor& grad_dists) { const at::Tensor& grad_dists) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1}, tris_t{tris, "tris", 2},
grad_dists_t{grad_dists, "grad_dists", 3};
at::CheckedFrom c = "PointFaceArrayDistanceBackwardCuda";
at::checkAllSameGPU(c, {points_t, tris_t, grad_dists_t});
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int64_t P = points.size(0); const int64_t P = points.size(0);
const int64_t T = tris.size(0); const int64_t T = tris.size(0);
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3"); TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
AT_ASSERTM( TORCH_CHECK(
(tris.size(1) == 3) && (tris.size(2) == 3), (tris.size(1) == 3) && (tris.size(2) == 3),
"tris must be of shape Tx3x3"); "tris must be of shape Tx3x3");
AT_ASSERTM((grad_dists.size(0) == P) && (grad_dists.size(1) == T)); TORCH_CHECK((grad_dists.size(0) == P) && (grad_dists.size(1) == T));
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options()); at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
PointFaceArrayBackwardKernel<<<blocks, threads>>>( PointFaceArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(), points.data_ptr<float>(),
tris.data_ptr<float>(), tris.data_ptr<float>(),
grad_dists.data_ptr<float>(), grad_dists.data_ptr<float>(),
...@@ -570,5 +680,6 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda( ...@@ -570,5 +680,6 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda(
P, P,
T); T);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(grad_points, grad_tris); return std::make_tuple(grad_points, grad_tris);
} }
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <cstdio> #include <cstdio>
#include <tuple> #include <tuple>
#include "utils/pytorch3d_cutils.h"
// **************************************************************************** // ****************************************************************************
// * PointFaceDistance * // * PointFaceDistance *
...@@ -55,6 +56,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward( ...@@ -55,6 +56,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
const int64_t max_points) { const int64_t max_points) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
return PointFaceDistanceForwardCuda( return PointFaceDistanceForwardCuda(
points, points_first_idx, tris, tris_first_idx, max_points); points, points_first_idx, tris, tris_first_idx, max_points);
#else #else
...@@ -95,6 +100,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward( ...@@ -95,6 +100,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(idx_points);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return PointFaceDistanceBackwardCuda(points, tris, idx_points, grad_dists); return PointFaceDistanceBackwardCuda(points, tris, idx_points, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -151,6 +160,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward( ...@@ -151,6 +160,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
const int64_t max_tris) { const int64_t max_tris) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
return FacePointDistanceForwardCuda( return FacePointDistanceForwardCuda(
points, points_first_idx, tris, tris_first_idx, max_tris); points, points_first_idx, tris, tris_first_idx, max_tris);
#else #else
...@@ -191,6 +204,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward( ...@@ -191,6 +204,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(idx_tris);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return FacePointDistanceBackwardCuda(points, tris, idx_tris, grad_dists); return FacePointDistanceBackwardCuda(points, tris, idx_tris, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -233,6 +250,8 @@ torch::Tensor PointFaceArrayDistanceForward( ...@@ -233,6 +250,8 @@ torch::Tensor PointFaceArrayDistanceForward(
const torch::Tensor& tris) { const torch::Tensor& tris) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
return PointFaceArrayDistanceForwardCuda(points, tris); return PointFaceArrayDistanceForwardCuda(points, tris);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
...@@ -254,7 +273,6 @@ torch::Tensor PointFaceArrayDistanceForward( ...@@ -254,7 +273,6 @@ torch::Tensor PointFaceArrayDistanceForward(
// //
#ifdef WITH_CUDA #ifdef WITH_CUDA
std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackwardCuda( std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackwardCuda(
const torch::Tensor& points, const torch::Tensor& points,
const torch::Tensor& tris, const torch::Tensor& tris,
...@@ -267,6 +285,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward( ...@@ -267,6 +285,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return PointFaceArrayDistanceBackwardCuda(points, tris, grad_dists); return PointFaceArrayDistanceBackwardCuda(points, tris, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support."); AT_ERROR("Not compiled with GPU support.");
......
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <float.h> #include <float.h>
#include <math.h> #include <math.h>
#include <thrust/tuple.h> #include <thrust/tuple.h>
...@@ -285,14 +287,14 @@ RasterizeMeshesNaiveCuda( ...@@ -285,14 +287,14 @@ RasterizeMeshesNaiveCuda(
const int num_closest, const int num_closest,
const bool perspective_correct, const bool perspective_correct,
const bool cull_backfaces) { const bool cull_backfaces) {
if (face_verts.ndimension() != 3 || face_verts.size(1) != 3 || TORCH_CHECK(
face_verts.size(2) != 3) { face_verts.ndimension() == 3 && face_verts.size(1) == 3 &&
AT_ERROR("face_verts must have dimensions (num_faces, 3, 3)"); face_verts.size(2) == 3,
} "face_verts must have dimensions (num_faces, 3, 3)");
if (num_faces_per_mesh.size(0) != mesh_to_faces_packed_first_idx.size(0)) {
AT_ERROR( TORCH_CHECK(
"num_faces_per_mesh must have save size first dimension as mesh_to_faces_packed_first_idx"); num_faces_per_mesh.size(0) == mesh_to_faces_packed_first_idx.size(0),
} "num_faces_per_mesh must have save size first dimension as mesh_to_faces_packed_first_idx");
if (num_closest > kMaxPointsPerPixel) { if (num_closest > kMaxPointsPerPixel) {
std::stringstream ss; std::stringstream ss;
...@@ -300,6 +302,20 @@ RasterizeMeshesNaiveCuda( ...@@ -300,6 +302,20 @@ RasterizeMeshesNaiveCuda(
AT_ERROR(ss.str()); AT_ERROR(ss.str());
} }
// Check inputs are on the same device
at::TensorArg face_verts_t{face_verts, "face_verts", 1},
mesh_to_faces_packed_first_idx_t{
mesh_to_faces_packed_first_idx, "mesh_to_faces_packed_first_idx", 2},
num_faces_per_mesh_t{num_faces_per_mesh, "num_faces_per_mesh", 3};
at::CheckedFrom c = "RasterizeMeshesNaiveCuda";
at::checkAllSameGPU(
c,
{face_verts_t, mesh_to_faces_packed_first_idx_t, num_faces_per_mesh_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(face_verts.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int N = num_faces_per_mesh.size(0); // batch size. const int N = num_faces_per_mesh.size(0); // batch size.
const int H = image_size; // Assume square images. const int H = image_size; // Assume square images.
const int W = image_size; const int W = image_size;
...@@ -313,10 +329,15 @@ RasterizeMeshesNaiveCuda( ...@@ -313,10 +329,15 @@ RasterizeMeshesNaiveCuda(
at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts); at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts);
at::Tensor bary = at::full({N, H, W, K, 3}, -1, float_opts); at::Tensor bary = at::full({N, H, W, K, 3}, -1, float_opts);
if (face_idxs.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(face_idxs, zbuf, bary, pix_dists);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizeMeshesNaiveCudaKernel<<<blocks, threads>>>( RasterizeMeshesNaiveCudaKernel<<<blocks, threads, 0, stream>>>(
face_verts.contiguous().data_ptr<float>(), face_verts.contiguous().data_ptr<float>(),
mesh_to_faces_packed_first_idx.contiguous().data_ptr<int64_t>(), mesh_to_faces_packed_first_idx.contiguous().data_ptr<int64_t>(),
num_faces_per_mesh.contiguous().data_ptr<int64_t>(), num_faces_per_mesh.contiguous().data_ptr<int64_t>(),
...@@ -332,6 +353,7 @@ RasterizeMeshesNaiveCuda( ...@@ -332,6 +353,7 @@ RasterizeMeshesNaiveCuda(
pix_dists.contiguous().data_ptr<float>(), pix_dists.contiguous().data_ptr<float>(),
bary.contiguous().data_ptr<float>()); bary.contiguous().data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(face_idxs, zbuf, bary, pix_dists); return std::make_tuple(face_idxs, zbuf, bary, pix_dists);
} }
...@@ -465,6 +487,22 @@ at::Tensor RasterizeMeshesBackwardCuda( ...@@ -465,6 +487,22 @@ at::Tensor RasterizeMeshesBackwardCuda(
const at::Tensor& grad_bary, // (N, H, W, K, 3) const at::Tensor& grad_bary, // (N, H, W, K, 3)
const at::Tensor& grad_dists, // (N, H, W, K) const at::Tensor& grad_dists, // (N, H, W, K)
const bool perspective_correct) { const bool perspective_correct) {
// Check inputs are on the same device
at::TensorArg face_verts_t{face_verts, "face_verts", 1},
pix_to_face_t{pix_to_face, "pix_to_face", 2},
grad_zbuf_t{grad_zbuf, "grad_zbuf", 3},
grad_bary_t{grad_bary, "grad_bary", 4},
grad_dists_t{grad_dists, "grad_dists", 5};
at::CheckedFrom c = "RasterizeMeshesBackwardCuda";
at::checkAllSameGPU(
c, {face_verts_t, pix_to_face_t, grad_zbuf_t, grad_bary_t, grad_dists_t});
at::checkAllSameType(
c, {face_verts_t, grad_zbuf_t, grad_bary_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(face_verts.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int F = face_verts.size(0); const int F = face_verts.size(0);
const int N = pix_to_face.size(0); const int N = pix_to_face.size(0);
const int H = pix_to_face.size(1); const int H = pix_to_face.size(1);
...@@ -472,10 +510,16 @@ at::Tensor RasterizeMeshesBackwardCuda( ...@@ -472,10 +510,16 @@ at::Tensor RasterizeMeshesBackwardCuda(
const int K = pix_to_face.size(3); const int K = pix_to_face.size(3);
at::Tensor grad_face_verts = at::zeros({F, 3, 3}, face_verts.options()); at::Tensor grad_face_verts = at::zeros({F, 3, 3}, face_verts.options());
if (grad_face_verts.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_face_verts;
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizeMeshesBackwardCudaKernel<<<blocks, threads>>>( RasterizeMeshesBackwardCudaKernel<<<blocks, threads, 0, stream>>>(
face_verts.contiguous().data_ptr<float>(), face_verts.contiguous().data_ptr<float>(),
pix_to_face.contiguous().data_ptr<int64_t>(), pix_to_face.contiguous().data_ptr<int64_t>(),
perspective_correct, perspective_correct,
...@@ -488,6 +532,7 @@ at::Tensor RasterizeMeshesBackwardCuda( ...@@ -488,6 +532,7 @@ at::Tensor RasterizeMeshesBackwardCuda(
grad_dists.contiguous().data_ptr<float>(), grad_dists.contiguous().data_ptr<float>(),
grad_face_verts.contiguous().data_ptr<float>()); grad_face_verts.contiguous().data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return grad_face_verts; return grad_face_verts;
} }
...@@ -626,10 +671,24 @@ at::Tensor RasterizeMeshesCoarseCuda( ...@@ -626,10 +671,24 @@ at::Tensor RasterizeMeshesCoarseCuda(
const float blur_radius, const float blur_radius,
const int bin_size, const int bin_size,
const int max_faces_per_bin) { const int max_faces_per_bin) {
if (face_verts.ndimension() != 3 || face_verts.size(1) != 3 || TORCH_CHECK(
face_verts.size(2) != 3) { face_verts.ndimension() == 3 && face_verts.size(1) == 3 &&
AT_ERROR("face_verts must have dimensions (num_faces, 3, 3)"); face_verts.size(2) == 3,
} "face_verts must have dimensions (num_faces, 3, 3)");
// Check inputs are on the same device
at::TensorArg face_verts_t{face_verts, "face_verts", 1},
mesh_to_face_first_idx_t{
mesh_to_face_first_idx, "mesh_to_face_first_idx", 2},
num_faces_per_mesh_t{num_faces_per_mesh, "num_faces_per_mesh", 3};
at::CheckedFrom c = "RasterizeMeshesCoarseCuda";
at::checkAllSameGPU(
c, {face_verts_t, mesh_to_face_first_idx_t, num_faces_per_mesh_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(face_verts.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int W = image_size; const int W = image_size;
const int H = image_size; const int H = image_size;
const int F = face_verts.size(0); const int F = face_verts.size(0);
...@@ -645,12 +704,18 @@ at::Tensor RasterizeMeshesCoarseCuda( ...@@ -645,12 +704,18 @@ at::Tensor RasterizeMeshesCoarseCuda(
auto opts = face_verts.options().dtype(at::kInt); auto opts = face_verts.options().dtype(at::kInt);
at::Tensor faces_per_bin = at::zeros({N, num_bins, num_bins}, opts); at::Tensor faces_per_bin = at::zeros({N, num_bins, num_bins}, opts);
at::Tensor bin_faces = at::full({N, num_bins, num_bins, M}, -1, opts); at::Tensor bin_faces = at::full({N, num_bins, num_bins, M}, -1, opts);
if (bin_faces.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return bin_faces;
}
const int chunk_size = 512; const int chunk_size = 512;
const size_t shared_size = num_bins * num_bins * chunk_size / 8; const size_t shared_size = num_bins * num_bins * chunk_size / 8;
const size_t blocks = 64; const size_t blocks = 64;
const size_t threads = 512; const size_t threads = 512;
RasterizeMeshesCoarseCudaKernel<<<blocks, threads, shared_size>>>( RasterizeMeshesCoarseCudaKernel<<<blocks, threads, shared_size, stream>>>(
face_verts.contiguous().data_ptr<float>(), face_verts.contiguous().data_ptr<float>(),
mesh_to_face_first_idx.contiguous().data_ptr<int64_t>(), mesh_to_face_first_idx.contiguous().data_ptr<int64_t>(),
num_faces_per_mesh.contiguous().data_ptr<int64_t>(), num_faces_per_mesh.contiguous().data_ptr<int64_t>(),
...@@ -664,6 +729,8 @@ at::Tensor RasterizeMeshesCoarseCuda( ...@@ -664,6 +729,8 @@ at::Tensor RasterizeMeshesCoarseCuda(
M, M,
faces_per_bin.contiguous().data_ptr<int32_t>(), faces_per_bin.contiguous().data_ptr<int32_t>(),
bin_faces.contiguous().data_ptr<int32_t>()); bin_faces.contiguous().data_ptr<int32_t>());
AT_CUDA_CHECK(cudaGetLastError());
return bin_faces; return bin_faces;
} }
...@@ -775,13 +842,22 @@ RasterizeMeshesFineCuda( ...@@ -775,13 +842,22 @@ RasterizeMeshesFineCuda(
const int faces_per_pixel, const int faces_per_pixel,
const bool perspective_correct, const bool perspective_correct,
const bool cull_backfaces) { const bool cull_backfaces) {
if (face_verts.ndimension() != 3 || face_verts.size(1) != 3 || TORCH_CHECK(
face_verts.size(2) != 3) { face_verts.ndimension() == 3 && face_verts.size(1) == 3 &&
AT_ERROR("face_verts must have dimensions (num_faces, 3, 3)"); face_verts.size(2) == 3,
} "face_verts must have dimensions (num_faces, 3, 3)");
if (bin_faces.ndimension() != 4) { TORCH_CHECK(bin_faces.ndimension() == 4, "bin_faces must have 4 dimensions");
AT_ERROR("bin_faces must have 4 dimensions");
} // Check inputs are on the same device
at::TensorArg face_verts_t{face_verts, "face_verts", 1},
bin_faces_t{bin_faces, "bin_faces", 2};
at::CheckedFrom c = "RasterizeMeshesFineCuda";
at::checkAllSameGPU(c, {face_verts_t, bin_faces_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(face_verts.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int N = bin_faces.size(0); const int N = bin_faces.size(0);
const int B = bin_faces.size(1); const int B = bin_faces.size(1);
const int M = bin_faces.size(3); const int M = bin_faces.size(3);
...@@ -790,7 +866,7 @@ RasterizeMeshesFineCuda( ...@@ -790,7 +866,7 @@ RasterizeMeshesFineCuda(
const int W = image_size; const int W = image_size;
if (K > kMaxPointsPerPixel) { if (K > kMaxPointsPerPixel) {
AT_ERROR("Must have num_closest <= 8"); AT_ERROR("Must have num_closest <= 150");
} }
auto long_opts = face_verts.options().dtype(at::kLong); auto long_opts = face_verts.options().dtype(at::kLong);
auto float_opts = face_verts.options().dtype(at::kFloat); auto float_opts = face_verts.options().dtype(at::kFloat);
...@@ -800,10 +876,15 @@ RasterizeMeshesFineCuda( ...@@ -800,10 +876,15 @@ RasterizeMeshesFineCuda(
at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts); at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts);
at::Tensor bary = at::full({N, H, W, K, 3}, -1, float_opts); at::Tensor bary = at::full({N, H, W, K, 3}, -1, float_opts);
if (face_idxs.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(face_idxs, zbuf, bary, pix_dists);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizeMeshesFineCudaKernel<<<blocks, threads>>>( RasterizeMeshesFineCudaKernel<<<blocks, threads, 0, stream>>>(
face_verts.contiguous().data_ptr<float>(), face_verts.contiguous().data_ptr<float>(),
bin_faces.contiguous().data_ptr<int32_t>(), bin_faces.contiguous().data_ptr<int32_t>(),
blur_radius, blur_radius,
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <cstdio> #include <cstdio>
#include <tuple> #include <tuple>
#include "utils/pytorch3d_cutils.h"
// **************************************************************************** // ****************************************************************************
// * FORWARD PASS * // * FORWARD PASS *
...@@ -95,6 +96,9 @@ RasterizeMeshesNaive( ...@@ -95,6 +96,9 @@ RasterizeMeshesNaive(
// TODO: Better type checking. // TODO: Better type checking.
if (face_verts.is_cuda()) { if (face_verts.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(mesh_to_face_first_idx);
CHECK_CONTIGUOUS_CUDA(num_faces_per_mesh);
return RasterizeMeshesNaiveCuda( return RasterizeMeshesNaiveCuda(
face_verts, face_verts,
mesh_to_face_first_idx, mesh_to_face_first_idx,
...@@ -175,6 +179,11 @@ torch::Tensor RasterizeMeshesBackward( ...@@ -175,6 +179,11 @@ torch::Tensor RasterizeMeshesBackward(
const bool perspective_correct) { const bool perspective_correct) {
if (face_verts.is_cuda()) { if (face_verts.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(pix_to_face);
CHECK_CONTIGUOUS_CUDA(grad_zbuf);
CHECK_CONTIGUOUS_CUDA(grad_bary);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return RasterizeMeshesBackwardCuda( return RasterizeMeshesBackwardCuda(
face_verts, face_verts,
pix_to_face, pix_to_face,
...@@ -251,6 +260,9 @@ torch::Tensor RasterizeMeshesCoarse( ...@@ -251,6 +260,9 @@ torch::Tensor RasterizeMeshesCoarse(
const int max_faces_per_bin) { const int max_faces_per_bin) {
if (face_verts.is_cuda()) { if (face_verts.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(mesh_to_face_first_idx);
CHECK_CONTIGUOUS_CUDA(num_faces_per_mesh);
return RasterizeMeshesCoarseCuda( return RasterizeMeshesCoarseCuda(
face_verts, face_verts,
mesh_to_face_first_idx, mesh_to_face_first_idx,
...@@ -347,6 +359,8 @@ RasterizeMeshesFine( ...@@ -347,6 +359,8 @@ RasterizeMeshesFine(
const bool cull_backfaces) { const bool cull_backfaces) {
if (face_verts.is_cuda()) { if (face_verts.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(bin_faces);
return RasterizeMeshesFineCuda( return RasterizeMeshesFineCuda(
face_verts, face_verts,
bin_faces, bin_faces,
......
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <math.h> #include <math.h>
#include <cstdio> #include <cstdio>
#include <sstream> #include <sstream>
...@@ -145,13 +147,25 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda( ...@@ -145,13 +147,25 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
const int image_size, const int image_size,
const float radius, const float radius,
const int points_per_pixel) { const int points_per_pixel) {
if (points.ndimension() != 2 || points.size(1) != 3) { // Check inputs are on the same device
AT_ERROR("points must have dimensions (num_points, 3)"); at::TensorArg points_t{points, "points", 1},
} cloud_to_packed_first_idx_t{
if (num_points_per_cloud.size(0) != cloud_to_packed_first_idx.size(0)) { cloud_to_packed_first_idx, "cloud_to_packed_first_idx", 2},
AT_ERROR( num_points_per_cloud_t{num_points_per_cloud, "num_points_per_cloud", 3};
"num_points_per_cloud must have same size first dimension as cloud_to_packed_first_idx"); at::CheckedFrom c = "RasterizePointsNaiveCuda";
} at::checkAllSameGPU(
c, {points_t, cloud_to_packed_first_idx_t, num_points_per_cloud_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(
points.ndimension() == 2 && points.size(1) == 3,
"points must have dimensions (num_points, 3)");
TORCH_CHECK(
num_points_per_cloud.size(0) == cloud_to_packed_first_idx.size(0),
"num_points_per_cloud must have same size first dimension as cloud_to_packed_first_idx");
const int N = num_points_per_cloud.size(0); // batch size. const int N = num_points_per_cloud.size(0); // batch size.
const int S = image_size; const int S = image_size;
...@@ -169,9 +183,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda( ...@@ -169,9 +183,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts); at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts);
at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts); at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts);
if (point_idxs.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(point_idxs, zbuf, pix_dists);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizePointsNaiveCudaKernel<<<blocks, threads>>>( RasterizePointsNaiveCudaKernel<<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<float>(), points.contiguous().data_ptr<float>(),
cloud_to_packed_first_idx.contiguous().data_ptr<int64_t>(), cloud_to_packed_first_idx.contiguous().data_ptr<int64_t>(),
num_points_per_cloud.contiguous().data_ptr<int64_t>(), num_points_per_cloud.contiguous().data_ptr<int64_t>(),
...@@ -182,6 +201,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda( ...@@ -182,6 +201,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
point_idxs.contiguous().data_ptr<int32_t>(), point_idxs.contiguous().data_ptr<int32_t>(),
zbuf.contiguous().data_ptr<float>(), zbuf.contiguous().data_ptr<float>(),
pix_dists.contiguous().data_ptr<float>()); pix_dists.contiguous().data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(point_idxs, zbuf, pix_dists); return std::make_tuple(point_idxs, zbuf, pix_dists);
} }
...@@ -323,14 +344,28 @@ at::Tensor RasterizePointsCoarseCuda( ...@@ -323,14 +344,28 @@ at::Tensor RasterizePointsCoarseCuda(
const float radius, const float radius,
const int bin_size, const int bin_size,
const int max_points_per_bin) { const int max_points_per_bin) {
TORCH_CHECK(
points.ndimension() == 2 && points.size(1) == 3,
"points must have dimensions (num_points, 3)");
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
cloud_to_packed_first_idx_t{
cloud_to_packed_first_idx, "cloud_to_packed_first_idx", 2},
num_points_per_cloud_t{num_points_per_cloud, "num_points_per_cloud", 3};
at::CheckedFrom c = "RasterizePointsCoarseCuda";
at::checkAllSameGPU(
c, {points_t, cloud_to_packed_first_idx_t, num_points_per_cloud_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int P = points.size(0); const int P = points.size(0);
const int N = num_points_per_cloud.size(0); const int N = num_points_per_cloud.size(0);
const int num_bins = 1 + (image_size - 1) / bin_size; // divide round up const int num_bins = 1 + (image_size - 1) / bin_size; // divide round up
const int M = max_points_per_bin; const int M = max_points_per_bin;
if (points.ndimension() != 2 || points.size(1) != 3) {
AT_ERROR("points must have dimensions (num_points, 3)");
}
if (num_bins >= 22) { if (num_bins >= 22) {
// Make sure we do not use too much shared memory. // Make sure we do not use too much shared memory.
std::stringstream ss; std::stringstream ss;
...@@ -340,12 +375,18 @@ at::Tensor RasterizePointsCoarseCuda( ...@@ -340,12 +375,18 @@ at::Tensor RasterizePointsCoarseCuda(
auto opts = points.options().dtype(at::kInt); auto opts = points.options().dtype(at::kInt);
at::Tensor points_per_bin = at::zeros({N, num_bins, num_bins}, opts); at::Tensor points_per_bin = at::zeros({N, num_bins, num_bins}, opts);
at::Tensor bin_points = at::full({N, num_bins, num_bins, M}, -1, opts); at::Tensor bin_points = at::full({N, num_bins, num_bins, M}, -1, opts);
if (bin_points.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return bin_points;
}
const int chunk_size = 512; const int chunk_size = 512;
const size_t shared_size = num_bins * num_bins * chunk_size / 8; const size_t shared_size = num_bins * num_bins * chunk_size / 8;
const size_t blocks = 64; const size_t blocks = 64;
const size_t threads = 512; const size_t threads = 512;
RasterizePointsCoarseCudaKernel<<<blocks, threads, shared_size>>>( RasterizePointsCoarseCudaKernel<<<blocks, threads, shared_size, stream>>>(
points.contiguous().data_ptr<float>(), points.contiguous().data_ptr<float>(),
cloud_to_packed_first_idx.contiguous().data_ptr<int64_t>(), cloud_to_packed_first_idx.contiguous().data_ptr<int64_t>(),
num_points_per_cloud.contiguous().data_ptr<int64_t>(), num_points_per_cloud.contiguous().data_ptr<int64_t>(),
...@@ -358,6 +399,8 @@ at::Tensor RasterizePointsCoarseCuda( ...@@ -358,6 +399,8 @@ at::Tensor RasterizePointsCoarseCuda(
M, M,
points_per_bin.contiguous().data_ptr<int32_t>(), points_per_bin.contiguous().data_ptr<int32_t>(),
bin_points.contiguous().data_ptr<int32_t>()); bin_points.contiguous().data_ptr<int32_t>());
AT_CUDA_CHECK(cudaGetLastError());
return bin_points; return bin_points;
} }
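
The coarse rasterizers above reject num_bins >= 22 to keep their dynamic shared-memory request in budget; that request is the num_bins * num_bins * chunk_size / 8 expression used in the launches, which works out to one bit per (bin, point-in-chunk) pair if the kernel keeps a per-bin bitmask over each 512-element chunk (an assumption here, since the kernel body is not shown in this hunk). A small worked sketch of the sizing, with the 48 KB figure assumed as the usual default per-block shared-memory limit:

// Sketch: mirrors the shared_size expression used by the coarse kernels above.
#include <cstddef>
#include <cstdio>

static size_t CoarseSharedSizeBytes(int num_bins, int chunk_size = 512) {
  // chunk_size / 8 bytes per bin, for num_bins * num_bins bins.
  return static_cast<size_t>(num_bins) * num_bins * chunk_size / 8;
}

int main() {
  // image_size = 64, bin_size = 8 -> num_bins = 8 -> 8 * 8 * 64 = 4096 bytes.
  printf("%zu bytes\n", CoarseSharedSizeBytes(8));
  // num_bins = 21 -> 21 * 21 * 64 = 28224 bytes, still under a 48 KB budget.
  printf("%zu bytes\n", CoarseSharedSizeBytes(21));
  return 0;
}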
...@@ -448,13 +491,23 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda( ...@@ -448,13 +491,23 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
const float radius, const float radius,
const int bin_size, const int bin_size,
const int points_per_pixel) { const int points_per_pixel) {
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1},
bin_points_t{bin_points, "bin_points", 2};
at::CheckedFrom c = "RasterizePointsFineCuda";
at::checkAllSameGPU(c, {points_t, bin_points_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int N = bin_points.size(0); const int N = bin_points.size(0);
const int B = bin_points.size(1); // num_bins const int B = bin_points.size(1); // num_bins
const int M = bin_points.size(3); const int M = bin_points.size(3);
const int S = image_size; const int S = image_size;
const int K = points_per_pixel; const int K = points_per_pixel;
if (K > kMaxPointsPerPixel) { if (K > kMaxPointsPerPixel) {
AT_ERROR("Must have num_closest <= 8"); AT_ERROR("Must have num_closest <= 150");
} }
auto int_opts = points.options().dtype(at::kInt); auto int_opts = points.options().dtype(at::kInt);
auto float_opts = points.options().dtype(at::kFloat); auto float_opts = points.options().dtype(at::kFloat);
...@@ -462,9 +515,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda( ...@@ -462,9 +515,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts); at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts);
at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts); at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts);
if (point_idxs.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(point_idxs, zbuf, pix_dists);
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizePointsFineCudaKernel<<<blocks, threads>>>( RasterizePointsFineCudaKernel<<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<float>(), points.contiguous().data_ptr<float>(),
bin_points.contiguous().data_ptr<int32_t>(), bin_points.contiguous().data_ptr<int32_t>(),
radius, radius,
...@@ -478,6 +536,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda( ...@@ -478,6 +536,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
zbuf.contiguous().data_ptr<float>(), zbuf.contiguous().data_ptr<float>(),
pix_dists.contiguous().data_ptr<float>()); pix_dists.contiguous().data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(point_idxs, zbuf, pix_dists); return std::make_tuple(point_idxs, zbuf, pix_dists);
} }
...@@ -537,6 +596,19 @@ at::Tensor RasterizePointsBackwardCuda( ...@@ -537,6 +596,19 @@ at::Tensor RasterizePointsBackwardCuda(
const at::Tensor& idxs, // (N, H, W, K) const at::Tensor& idxs, // (N, H, W, K)
const at::Tensor& grad_zbuf, // (N, H, W, K) const at::Tensor& grad_zbuf, // (N, H, W, K)
const at::Tensor& grad_dists) { // (N, H, W, K) const at::Tensor& grad_dists) { // (N, H, W, K)
// Check inputs are on the same device
at::TensorArg points_t{points, "points", 1}, idxs_t{idxs, "idxs", 2},
grad_zbuf_t{grad_zbuf, "grad_zbuf", 3},
grad_dists_t{grad_dists, "grad_dists", 4};
at::CheckedFrom c = "RasterizePointsBackwardCuda";
at::checkAllSameGPU(c, {points_t, idxs_t, grad_zbuf_t, grad_dists_t});
at::checkAllSameType(c, {points_t, grad_zbuf_t, grad_dists_t});
// Set the device for the kernel launch based on the device of the input
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int P = points.size(0); const int P = points.size(0);
const int N = idxs.size(0); const int N = idxs.size(0);
const int H = idxs.size(1); const int H = idxs.size(1);
...@@ -544,10 +616,16 @@ at::Tensor RasterizePointsBackwardCuda( ...@@ -544,10 +616,16 @@ at::Tensor RasterizePointsBackwardCuda(
const int K = idxs.size(3); const int K = idxs.size(3);
at::Tensor grad_points = at::zeros({P, 3}, points.options()); at::Tensor grad_points = at::zeros({P, 3}, points.options());
if (grad_points.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_points;
}
const size_t blocks = 1024; const size_t blocks = 1024;
const size_t threads = 64; const size_t threads = 64;
RasterizePointsBackwardCudaKernel<<<blocks, threads>>>( RasterizePointsBackwardCudaKernel<<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<float>(), points.contiguous().data_ptr<float>(),
idxs.contiguous().data_ptr<int32_t>(), idxs.contiguous().data_ptr<int32_t>(),
N, N,
...@@ -559,5 +637,6 @@ at::Tensor RasterizePointsBackwardCuda( ...@@ -559,5 +637,6 @@ at::Tensor RasterizePointsBackwardCuda(
grad_dists.contiguous().data_ptr<float>(), grad_dists.contiguous().data_ptr<float>(),
grad_points.contiguous().data_ptr<float>()); grad_points.contiguous().data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return grad_points; return grad_points;
} }
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <torch/extension.h> #include <torch/extension.h>
#include <cstdio> #include <cstdio>
#include <tuple> #include <tuple>
#include "utils/pytorch3d_cutils.h"
// **************************************************************************** // ****************************************************************************
// * NAIVE RASTERIZATION * // * NAIVE RASTERIZATION *
...@@ -66,6 +67,9 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaive( ...@@ -66,6 +67,9 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaive(
if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() && if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() &&
num_points_per_cloud.is_cuda()) { num_points_per_cloud.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(cloud_to_packed_first_idx);
CHECK_CONTIGUOUS_CUDA(num_points_per_cloud);
return RasterizePointsNaiveCuda( return RasterizePointsNaiveCuda(
points, points,
cloud_to_packed_first_idx, cloud_to_packed_first_idx,
...@@ -140,6 +144,9 @@ torch::Tensor RasterizePointsCoarse( ...@@ -140,6 +144,9 @@ torch::Tensor RasterizePointsCoarse(
if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() && if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() &&
num_points_per_cloud.is_cuda()) { num_points_per_cloud.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(cloud_to_packed_first_idx);
CHECK_CONTIGUOUS_CUDA(num_points_per_cloud);
return RasterizePointsCoarseCuda( return RasterizePointsCoarseCuda(
points, points,
cloud_to_packed_first_idx, cloud_to_packed_first_idx,
...@@ -208,6 +215,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFine( ...@@ -208,6 +215,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFine(
const int points_per_pixel) { const int points_per_pixel) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(bin_points);
return RasterizePointsFineCuda( return RasterizePointsFineCuda(
points, bin_points, image_size, radius, bin_size, points_per_pixel); points, bin_points, image_size, radius, bin_size, points_per_pixel);
#else #else
...@@ -257,6 +266,10 @@ torch::Tensor RasterizePointsBackward( ...@@ -257,6 +266,10 @@ torch::Tensor RasterizePointsBackward(
const torch::Tensor& grad_dists) { const torch::Tensor& grad_dists) {
if (points.is_cuda()) { if (points.is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(idxs);
CHECK_CONTIGUOUS_CUDA(grad_zbuf);
CHECK_CONTIGUOUS_CUDA(grad_dists);
return RasterizePointsBackwardCuda(points, idxs, grad_zbuf, grad_dists); return RasterizePointsBackwardCuda(points, idxs, grad_zbuf, grad_dists);
#else #else
AT_ERROR("Not compiled with GPU support"); AT_ERROR("Not compiled with GPU support");
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
#pragma once #pragma once
#include <torch/extension.h> #include <torch/extension.h>
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x "must be a CUDA tensor.") #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x "must be a CUDA tensor.")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x "must be contiguous.") TORCH_CHECK(x.is_contiguous(), #x "must be contiguous.")
#define CHECK_CONTIGUOUS_CUDA(x) \ #define CHECK_CONTIGUOUS_CUDA(x) \
CHECK_CUDA(x); \ CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x) CHECK_CONTIGUOUS(x)
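
With the switch from AT_ASSERTM to TORCH_CHECK, a failed device or contiguity check raises a user-facing c10::Error that names the tensor, and the CHECK_CONTIGUOUS_CUDA call sites added throughout this diff all funnel through these two macros. A hedged sketch of how a host dispatcher uses them; MyOp and its CPU/CUDA variants are hypothetical stand-ins, the macros are the ones defined just above.

// Sketch only: mirrors the dispatch pattern of the host functions in this diff.
#include <torch/extension.h>
#include "utils/pytorch3d_cutils.h"

torch::Tensor MyOpCpu(const torch::Tensor& a, const torch::Tensor& b);
#ifdef WITH_CUDA
torch::Tensor MyOpCuda(const torch::Tensor& a, const torch::Tensor& b);
#endif

torch::Tensor MyOp(const torch::Tensor& a, const torch::Tensor& b) {
  if (a.is_cuda()) {
#ifdef WITH_CUDA
    // Expands to TORCH_CHECK(x.is_cuda(), ...); TORCH_CHECK(x.is_contiguous(), ...)
    CHECK_CONTIGUOUS_CUDA(a);
    CHECK_CONTIGUOUS_CUDA(b);
    return MyOpCuda(a, b);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  return MyOpCpu(a, b);
}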