// Copyright (c) Meta Platforms, Inc. and affiliates. // // This source code is licensed under the MIT license found in the // LICENSE file in the root directory of this source tree. #include #include #include #include #include #include "rasterize_kernel.h" #include using namespace math; template __global__ void rasterize_kernel( const index_t nthreads, TensorInfo v, TensorInfo vi, TensorInfo packed_index_depth_img) { typedef typename math::TVec2 scalar2_t; typedef typename math::TVec3 scalar3_t; typedef typename math::TVec4 scalar4_t; const index_t H = packed_index_depth_img.sizes[1]; const index_t W = packed_index_depth_img.sizes[2]; const index_t V = v.sizes[1]; const index_t n_prim = vi.sizes[1]; const index_t index_sN = packed_index_depth_img.strides[0]; const index_t index_sH = packed_index_depth_img.strides[1]; const index_t index_sW = packed_index_depth_img.strides[2]; const index_t v_sN = v.strides[0]; const index_t v_sV = v.strides[1]; const index_t v_sC = v.strides[2]; const index_t vi_sN = vi.strides[0]; const index_t vi_sF = vi.strides[1]; const index_t vi_sI = vi.strides[2]; CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { const index_t n = index / n_prim; const index_t id = index % n_prim; const int32_t* __restrict vi_ptr = vi.data + vi_sN * n + vi_sF * id; const int32_t vi_0 = (int32_t)(((uint32_t)vi_ptr[vi_sI * 0]) & 0x0FFFFFFFU); const int32_t vi_1 = vi_ptr[vi_sI * 1]; const int32_t vi_2 = vi_ptr[vi_sI * 2]; assert(vi_0 < V && vi_1 < V && vi_2 < V); // Skip degenerate triangles. Useful for padding of vi buffer for batched rasterization bool triangle_is_degenerate = (vi_0 == vi_1) && (vi_1 == vi_2); const scalar_t* __restrict v_ptr = v.data + n * v_sN; const scalar2_t p_0 = {v_ptr[v_sV * vi_0 + v_sC * 0], v_ptr[v_sV * vi_0 + v_sC * 1]}; const scalar2_t p_1 = {v_ptr[v_sV * vi_1 + v_sC * 0], v_ptr[v_sV * vi_1 + v_sC * 1]}; const scalar2_t p_2 = {v_ptr[v_sV * vi_2 + v_sC * 0], v_ptr[v_sV * vi_2 + v_sC * 1]}; const scalar3_t p_012_z = { v_ptr[v_sV * vi_0 + v_sC * 2], v_ptr[v_sV * vi_1 + v_sC * 2], v_ptr[v_sV * vi_2 + v_sC * 2]}; const scalar2_t min_p = math::min(math::min(p_0, p_1), p_2); const scalar2_t max_p = math::max(math::max(p_0, p_1), p_2); const bool all_z_greater_0 = math::all_greater(p_012_z, {1e-8f, 1e-8f, 1e-8f}); const bool in_canvas = math::all_less_or_eq(min_p, {(scalar_t)(W - 1), (scalar_t)(H - 1)}) && math::all_greater(max_p, {0.f, 0.f}); if (all_z_greater_0 && in_canvas && ~triangle_is_degenerate) { const scalar2_t v_01 = p_1 - p_0; const scalar2_t v_02 = p_2 - p_0; const scalar2_t v_12 = p_2 - p_1; const scalar_t denominator = v_01.x * v_02.y - v_01.y * v_02.x; if (denominator != 0.f) { // Compute triangle bounds with extra border. int min_x = max(0, int(min_p.x)); int min_y = max(0, int(min_p.y)); int max_x = min((int)W - 1, int(max_p.x) + 1); int max_y = min((int)H - 1, int(max_p.y) + 1); // Loop over pixels inside triangle bbox. for (int y = min_y; y <= max_y; ++y) { for (int x = min_x; x <= max_x; ++x) { const scalar2_t p = {(scalar_t)x, (scalar_t)y}; const scalar2_t vp0p = p - p_0; const scalar2_t vp1p = p - p_1; scalar3_t bary = scalar3_t({ vp1p.y * v_12.x - vp1p.x * v_12.y, vp0p.x * v_02.y - vp0p.y * v_02.x, vp0p.y * v_01.x - vp0p.x * v_01.y, }); bary *= sign(denominator); const bool on_edge_or_inside = (bary.x >= 0.f) && (bary.y >= 0.f) && (bary.z >= 0.f); bool on_edge_0 = bary.x == 0.f; bool on_edge_1 = bary.y == 0.f; bool on_edge_2 = bary.z == 0.f; const bool is_top_left_0 = (denominator > 0) ? (v_12.y < 0.f || v_12.y == 0.0f && v_12.x > 0.f) : (v_12.y > 0.f || v_12.y == 0.0f && v_12.x < 0.f); const bool is_top_left_1 = (denominator > 0) ? (v_02.y > 0.f || v_02.y == 0.0f && v_02.x < 0.f) : (v_02.y < 0.f || v_02.y == 0.0f && v_02.x > 0.f); const bool is_top_left_2 = (denominator > 0) ? (v_01.y < 0.f || v_01.y == 0.0f && v_01.x > 0.f) : (v_01.y > 0.f || v_01.y == 0.0f && v_01.x < 0.f); const bool is_top_left_or_inside = on_edge_or_inside && !(on_edge_0 && !is_top_left_0 || on_edge_1 && !is_top_left_1 || on_edge_2 && !is_top_left_2); if (is_top_left_or_inside) { bary /= abs(denominator); // interpolate inverse depth linearly const scalar3_t d_inv = 1.0 / epsclamp(p_012_z); const scalar_t depth_inverse = dot(d_inv, bary); const scalar_t depth = 1.0f / epsclamp(depth_inverse); const unsigned long long packed_val = (static_cast(__float_as_uint(depth)) << 32u) | static_cast(id); atomicMin( reinterpret_cast(packed_index_depth_img.data) + index_sN * n + index_sH * y + index_sW * x, packed_val); } } } } } } } template __device__ inline void get_line( const math::TVec2& p1, const math::TVec2& p2, scalar_t& a, scalar_t& b, scalar_t& c) { a = p1.y - p2.y; b = p2.x - p1.x; c = p1.x * p2.y - p2.x * p1.y; } template __device__ inline bool is_point_in_segment( const math::TVec2& p1, const math::TVec2& p2, const math::TVec2& c) { return ( (((p2.x >= c.x) && (c.x >= p1.x)) || ((p2.x <= c.x) && (c.x <= p1.x))) && (((p2.y >= c.y) && (c.y >= p1.y)) || ((p2.y <= c.y) && (c.y <= p1.y)))); } template __device__ inline math::TVec2 get_cross_point(scalar_t a1, scalar_t b1, scalar_t c1, scalar_t a2, scalar_t b2, scalar_t c2) { scalar_t d = a1 * b2 - a2 * b1; if (d == scalar_t(0)) { return math::TVec2{std::numeric_limits().max()}; } return math::TVec2{(b1 * c2 - b2 * c1) / d, (a2 * c1 - a1 * c2) / d}; } template __device__ inline math::TVec2 get_cross_point( scalar_t a1, scalar_t b1, scalar_t c1, const math::TVec2& p1, const math::TVec2& p2) { scalar_t a2 = 1e16; scalar_t b2 = 1e16; scalar_t c2 = 1e16; get_line(p1, p2, a2, b2, c2); scalar_t d = a1 * b2 - a2 * b1; if (d == scalar_t(0)) { return math::TVec2{std::numeric_limits().max()}; } return math::TVec2{(b1 * c2 - b2 * c1) / d, (a2 * c1 - a1 * c2) / d}; } template __device__ inline bool is_crossing_dimond( const math::TVec2& p1, const math::TVec2& p2, const math::TVec2& p) { scalar_t a0 = 1e16; scalar_t b0 = 1e16; scalar_t c0 = 1e16; get_line(p1, p2, a0, b0, c0); bool intersecting = false; { math::TVec2 s0 = {p.x, p.y - scalar_t(0.5)}; math::TVec2 s1 = {p.x + scalar_t(0.5), p.y}; auto c = get_cross_point(a0, b0, c0, s0, s1); intersecting |= is_point_in_segment(s0, s1, c) && is_point_in_segment(p1, p2, c); } { math::TVec2 s0 = {p.x + scalar_t(0.5), p.y}; math::TVec2 s1 = {p.x, p.y + scalar_t(0.5)}; auto c = get_cross_point(a0, b0, c0, s0, s1); intersecting |= is_point_in_segment(s0, s1, c) && is_point_in_segment(p1, p2, c); } { math::TVec2 s0 = {p.x, p.y + scalar_t(0.5)}; math::TVec2 s1 = {p.x - scalar_t(0.5), p.y}; auto c = get_cross_point(a0, b0, c0, s0, s1); intersecting |= is_point_in_segment(s0, s1, c) && is_point_in_segment(p1, p2, c); } { math::TVec2 s0 = {p.x - scalar_t(0.5), p.y}; math::TVec2 s1 = {p.x, p.y - scalar_t(0.5)}; auto c = get_cross_point(a0, b0, c0, s0, s1); intersecting |= is_point_in_segment(s0, s1, c) && is_point_in_segment(p1, p2, c); } return intersecting; } template __global__ void rasterize_lines_kernel( const index_t nthreads, TensorInfo v, TensorInfo vi, TensorInfo packed_index_depth_img) { typedef typename math::TVec2 scalar2_t; typedef typename math::TVec3 scalar3_t; typedef typename math::TVec4 scalar4_t; const index_t H = packed_index_depth_img.sizes[1]; const index_t W = packed_index_depth_img.sizes[2]; const index_t V = v.sizes[1]; const index_t n_prim = vi.sizes[1]; const index_t index_sN = packed_index_depth_img.strides[0]; const index_t index_sH = packed_index_depth_img.strides[1]; const index_t index_sW = packed_index_depth_img.strides[2]; const index_t v_sN = v.strides[0]; const index_t v_sV = v.strides[1]; const index_t v_sC = v.strides[2]; const index_t vi_sN = vi.strides[0]; const index_t vi_sF = vi.strides[1]; const index_t vi_sI = vi.strides[2]; CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { const index_t n = index / n_prim; const index_t id = index % n_prim; const int32_t* __restrict vi_ptr = vi.data + vi_sN * n + vi_sF * id; const int32_t flag = (int32_t)((((uint32_t)vi_ptr[vi_sI * 0] & 0xF0000000U)) >> 28U); const int32_t vi_0 = (int32_t)(((uint32_t)vi_ptr[vi_sI * 0]) & 0x0FFFFFFFU); const int32_t vi_1 = vi_ptr[vi_sI * 1]; const int32_t vi_2 = vi_ptr[vi_sI * 2]; // Skip degenerate triangles. Useful for padding of vi buffer for batched rasterization bool triangle_is_degenerate = (vi_0 == vi_1) && (vi_1 == vi_2); const bool edge_0_visible = (flag & 0b00000001) != 0; const bool edge_1_visible = (flag & 0b00000010) != 0; const bool edge_2_visible = (flag & 0b00000100) != 0; assert(vi_0 < V && vi_1 < V && vi_2 < V); const scalar_t* __restrict v_ptr = v.data + n * v_sN; const scalar2_t p_0 = {v_ptr[v_sV * vi_0 + v_sC * 0], v_ptr[v_sV * vi_0 + v_sC * 1]}; const scalar2_t p_1 = {v_ptr[v_sV * vi_1 + v_sC * 0], v_ptr[v_sV * vi_1 + v_sC * 1]}; const scalar2_t p_2 = {v_ptr[v_sV * vi_2 + v_sC * 0], v_ptr[v_sV * vi_2 + v_sC * 1]}; const scalar3_t p_012_z = { v_ptr[v_sV * vi_0 + v_sC * 2], v_ptr[v_sV * vi_1 + v_sC * 2], v_ptr[v_sV * vi_2 + v_sC * 2]}; const scalar2_t min_p = math::min(math::min(p_0, p_1), p_2); const scalar2_t max_p = math::max(math::max(p_0, p_1), p_2); const bool all_z_greater_0 = math::all_greater(p_012_z, {1e-8f, 1e-8f, 1e-8f}); const bool in_canvas = math::all_less_or_eq(min_p, {(scalar_t)(W - 1), (scalar_t)(H - 1)}) && math::all_greater(max_p, {0.f, 0.f}); if (all_z_greater_0 && in_canvas && ~triangle_is_degenerate) { const scalar2_t v_01 = p_1 - p_0; const scalar2_t v_02 = p_2 - p_0; const scalar2_t v_12 = p_2 - p_1; const scalar_t denominator = v_01.x * v_02.y - v_01.y * v_02.x; if (denominator != 0.f) { // Compute triangle bounds with extra border. int min_x = max(1, int(min_p.x) - 2); int min_y = max(1, int(min_p.y) - 2); int max_x = min((int)W - 2, int(max_p.x) + 2); int max_y = min((int)H - 2, int(max_p.y) + 2); // Loop over pixels inside triangle bbox. for (int y = min_y; y <= max_y; ++y) { for (int x = min_x; x <= max_x; ++x) { const scalar2_t p = {(scalar_t)x, (scalar_t)y}; const scalar2_t vp0p = p - p_0; const scalar2_t vp1p = p - p_1; bool intersecting = false; intersecting |= is_crossing_dimond(p_0, p_1, p) && edge_0_visible; intersecting |= is_crossing_dimond(p_1, p_2, p) && edge_1_visible; intersecting |= is_crossing_dimond(p_0, p_2, p) && edge_2_visible; scalar3_t bary = scalar3_t({ vp1p.y * v_12.x - vp1p.x * v_12.y, vp0p.x * v_02.y - vp0p.y * v_02.x, vp0p.y * v_01.x - vp0p.x * v_01.y, }); bary *= sign(denominator); const bool on_edge_or_inside = (bary.x >= 0.f) && (bary.y >= 0.f) && (bary.z >= 0.f); bool on_edge_0 = bary.x == 0.f; bool on_edge_1 = bary.y == 0.f; bool on_edge_2 = bary.z == 0.f; const bool is_top_left_0 = (denominator > 0) ? (v_12.y < 0.f || v_12.y == 0.0f && v_12.x > 0.f) : (v_12.y > 0.f || v_12.y == 0.0f && v_12.x < 0.f); const bool is_top_left_1 = (denominator > 0) ? (v_02.y > 0.f || v_02.y == 0.0f && v_02.x < 0.f) : (v_02.y < 0.f || v_02.y == 0.0f && v_02.x > 0.f); const bool is_top_left_2 = (denominator > 0) ? (v_01.y < 0.f || v_01.y == 0.0f && v_01.x > 0.f) : (v_01.y > 0.f || v_01.y == 0.0f && v_01.x < 0.f); const bool is_top_left_or_inside = on_edge_or_inside && !(on_edge_0 && !is_top_left_0 || on_edge_1 && !is_top_left_1 || on_edge_2 && !is_top_left_2); if (is_top_left_or_inside || intersecting) { bary /= abs(denominator); bary = math::max(bary, scalar3_t{0, 0, 0}); bary = math::min(bary, scalar3_t{1, 1, 1}); bary = bary / math::sum(bary); // interpolate inverse depth linearly const scalar3_t d_inv = 1.0 / epsclamp(p_012_z); const scalar_t depth_inverse = dot(d_inv, bary); const scalar_t depth = 1.0f / epsclamp(depth_inverse); const unsigned long long packed_val = (static_cast(__float_as_uint(depth)) << 32u) | (intersecting ? static_cast(id) : 0xFFFFFFFFULL); atomicMin( reinterpret_cast(packed_index_depth_img.data) + index_sN * n + index_sH * y + index_sW * x, packed_val); } } } } } } } template __global__ void unpack_kernel( const index_t nthreads, TensorInfo packed_index_depth_img, TensorInfo depth_img, TensorInfo index_img) { CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { const unsigned long long int pv = reinterpret_cast(packed_index_depth_img.data)[index]; const auto depth_uint = static_cast(pv >> 32); depth_img.data[index] = depth_uint == 0xFFFFFFFF ? 0.0f : __uint_as_float(depth_uint); reinterpret_cast(index_img.data)[index] = static_cast(pv & 0xFFFFFFFF); } } std::vector rasterize_cuda( const torch::Tensor& v, const torch::Tensor& vi, int64_t height, int64_t width, bool wireframe) { TORCH_CHECK(v.defined() && vi.defined(), "rasterize(): expected all inputs to be defined"); auto v_opt = v.options(); auto vi_opt = vi.options(); TORCH_CHECK( (v.device() == vi.device()) && (v.is_cuda()), "rasterize(): expected all inputs to be on same cuda device"); TORCH_CHECK( v.is_floating_point(), "rasterize(): expected v to have floating point type, but v has ", v.dtype()); TORCH_CHECK( vi.dtype() == torch::kInt32, "rasterize(): expected vi to have int32 type, but vi has ", vi.dtype()); TORCH_CHECK( v.layout() == torch::kStrided && vi.layout() == torch::kStrided, "rasterize(): expected all inputs to have torch.strided layout"); TORCH_CHECK( (v.dim() == 3) && (vi.dim() == 3), "rasterize(): expected v.ndim == 3, vi.ndim == 3, " "but got v with sizes ", v.sizes(), " and vi with sizes ", vi.sizes()); TORCH_CHECK( v.size(2) == 3 && vi.size(2) == 3, "rasterize(): expected third dim of v to be of size 3, and last dim of vi to be of size 3, but got ", v.size(2), " in the third dim of v, and ", vi.size(2), " in the last dim of vi"); TORCH_CHECK( (vi.size(0) == v.size(0)), "rasterize(): expected first dim of vi to match first dim of v, but got ", v.size(0), " in first dim of v, and ", vi.size(0), " in the first dim of vi"); TORCH_CHECK( v.size(1) < 0x10000000U, "rasterize(): expected second dim of v to be less or eual to 268435456, but got ", v.size(1)); TORCH_CHECK( height > 0 && width > 0, "rasterize(): both height and width have to be greater than zero, but got height: ", height, ", and width: ", width); const at::cuda::OptionalCUDAGuard device_guard(device_of(v)); auto stream = at::cuda::getCurrentCUDAStream(); auto N = v.size(0); auto T = vi.size(1); auto H = height; auto W = width; const auto count_rasterize = N * T; const auto count_unpack = N * H * W; auto packed_index_depth_img = at::empty({N, H, W}, v.options().dtype(torch::kInt64)); auto depth_img = at::empty({N, H, W}, v.options().dtype(torch::kFloat32)); auto index_img = at::empty({N, H, W}, v.options().dtype(torch::kInt32)); cudaMemsetAsync( packed_index_depth_img.data_ptr(), 0xFF, N * H * W * torch::elementSize(torch::kInt64), stream); // rasterize if (count_rasterize > 0) { AT_DISPATCH_FLOATING_TYPES(v.scalar_type(), "rasterize_kernel", [&] { if (at::native::canUse32BitIndexMath(v) && at::native::canUse32BitIndexMath(vi) && at::native::canUse32BitIndexMath(packed_index_depth_img)) { typedef int index_type; if (wireframe) { rasterize_lines_kernel <<>>( static_cast(count_rasterize), getTensorInfo(v), getTensorInfo(vi), getTensorInfo(packed_index_depth_img)); } else { rasterize_kernel <<>>( static_cast(count_rasterize), getTensorInfo(v), getTensorInfo(vi), getTensorInfo(packed_index_depth_img)); } C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { typedef int64_t index_type; if (wireframe) { rasterize_lines_kernel <<>>( static_cast(count_rasterize), getTensorInfo(v), getTensorInfo(vi), getTensorInfo(packed_index_depth_img)); } else { rasterize_kernel <<>>( static_cast(count_rasterize), getTensorInfo(v), getTensorInfo(vi), getTensorInfo(packed_index_depth_img)); } C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } // unpack if (count_unpack > 0) { if (at::native::canUse32BitIndexMath(packed_index_depth_img) && at::native::canUse32BitIndexMath(depth_img) && at::native::canUse32BitIndexMath(index_img)) { typedef int index_type; unpack_kernel<<>>( static_cast(count_unpack), getTensorInfo(packed_index_depth_img), getTensorInfo(depth_img), getTensorInfo(index_img)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { typedef int64_t index_type; unpack_kernel<<>>( static_cast(count_unpack), getTensorInfo(packed_index_depth_img), getTensorInfo(depth_img), getTensorInfo(index_img)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } } return {depth_img, index_img}; }