support signed_distance

c5f26a44 · ashawkey · e1eb521b · c5f26a44 · c5f26a44 · c5f26a44
Commit c5f26a44 authored Dec 03, 2022 by ashawkey
11 changed files
--- a/.gitignore
+++ b/.gitignore
+__pycache__/
+build/
+*.egg-info/
+*.so
+.vs/
+*.ply
+eigen*/
\ No newline at end of file
--- a/cubvh/api.py
+++ b/cubvh/api.py
@@ -5,6 +5,11 @@ import torch
 # CUDA extension
 import _cubvh as _backend
+_sdf_mode_to_id = {
+    'watertight': 0,
+    'raystab': 1,
+}
 class cuBVH():
    def __init__(self, vertices, triangles):
        # vertices: np.ndarray, [N, 3]
@@ -78,4 +83,33 @@ class cuBVH():
        return distances, face_id, uvw
-    # def signed_distance():
+    def signed_distance(self, positions, return_uvw=False, mode='watertight'):
+        # positions: torch.Tensor, float, [N, 3]
+        positions = positions.float().contiguous()
+        if not positions.is_cuda: positions = positions.cuda()
+        prefix = positions.shape[:-1]
+        positions = positions.view(-1, 3)
+        N = positions.shape[0]
+        # init output buffers
+        distances = torch.empty(N, dtype=torch.float32, device=positions.device)
+        face_id = torch.empty(N, dtype=torch.int64, device=positions.device)
+        if return_uvw:
+            uvw = torch.empty(N, 3, dtype=torch.float32, device=positions.device)
+        else:
+            uvw = None
+        self.impl.signed_distance(positions, distances, face_id, uvw, _sdf_mode_to_id[mode]) # [N, 3]
+        distances = distances.view(*prefix)
+        face_id = face_id.view(*prefix)
+        if uvw is not None:
+            uvw = uvw.view(*prefix, 3)
+        return distances, face_id, uvw
--- a/include/cubvh/api.h
+++ b/include/cubvh/api.h
@@ -20,6 +20,7 @@ public:
    virtual void ray_trace(at::Tensor rays_o, at::Tensor rays_d, at::Tensor positions, at::Tensor face_id, at::Tensor depth) = 0;
    virtual void unsigned_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw) = 0;
+    virtual void signed_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw, uint32_t mode) = 0;
 };
 // function to create an implementation of cuBVH

--- a/include/cubvh/bvh.cuh
+++ b/include/cubvh/bvh.cuh
@@ -52,11 +52,11 @@ protected:
 public:
    virtual void build(std::vector<Triangle>& triangles, uint32_t n_primitives_per_leaf) = 0;
+    virtual void signed_distance_gpu(uint32_t n_elements, uint32_t mode, const float* positions, float* distances, int64_t* face_id, float* uvw, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
    virtual void unsigned_distance_gpu(uint32_t n_elements, const float* positions, float* distances, int64_t* face_id, float* uvw, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
    virtual void ray_trace_gpu(uint32_t n_elements, const float* rays_o, const float* rays_d, float* positions, int64_t* face_id, float* depth, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
    // KIUI: not supported now.
-    // virtual void signed_distance_gpu(uint32_t n_elements, EMeshSdfMode mode, const Eigen::Vector3f* gpu_positions, float* gpu_distances, const Triangle* gpu_triangles, bool use_existing_distances_as_upper_bounds, cudaStream_t stream) = 0;
    // virtual bool touches_triangle(const BoundingBox& bb, const Triangle* __restrict__ triangles) const = 0;
    // virtual void build_optix(const GPUMemory<Triangle>& triangles, cudaStream_t stream) = 0;

--- a/include/cubvh/common.h
+++ b/include/cubvh/common.h
@@ -60,4 +60,39 @@ __host__ __device__ void host_device_swap(T& a, T& b) {
    T c(a); a=b; b=c;
 }
+inline __host__ __device__ Eigen::Vector3f cylindrical_to_dir(const Eigen::Vector2f& p) {
+	const float cos_theta = -2.0f * p.x() + 1.0f;
+	const float phi = 2.0f * PI * (p.y() - 0.5f);
+	const float sin_theta = sqrtf(fmaxf(1.0f - cos_theta * cos_theta, 0.0f));
+	float sin_phi, cos_phi;
+	sincosf(phi, &sin_phi, &cos_phi);
+	return {sin_theta * cos_phi, sin_theta * sin_phi, cos_theta};
+}
+inline __host__ __device__ float fractf(float x) {
+	return x - floorf(x);
+}
+template <uint32_t N_DIRS>
+__device__ __host__ Eigen::Vector3f fibonacci_dir(uint32_t i, const Eigen::Vector2f& offset) {
+	// Fibonacci lattice with offset
+	float epsilon;
+	if (N_DIRS >= 11000) {
+		epsilon = 27;
+	} else if (N_DIRS >= 890) {
+		epsilon = 10;
+	} else if (N_DIRS >= 177) {
+		epsilon = 3.33;
+	} else if (N_DIRS >= 24) {
+		epsilon = 1.33;
+	} else {
+		epsilon = 0.33;
+	}
+	static constexpr float GOLDEN_RATIO = 1.6180339887498948482045868343656f;
+	return cylindrical_to_dir(Eigen::Vector2f{fractf((i+epsilon) / (N_DIRS-1+2*epsilon) + offset.x()), fractf(i / GOLDEN_RATIO + offset.y())});
+}
 }
\ No newline at end of file
--- a/include/cubvh/pcg32.h
+++ b/include/cubvh/pcg32.h
+/*
+ * Tiny self-contained version of the PCG Random Number Generation for C++
+ * put together from pieces of the much larger C/C++ codebase.
+ * Wenzel Jakob, February 2015
+ *
+ * The PCG random number generator was developed by Melissa O'Neill
+ * <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ *
+ * Note: This code was modified to work with CUDA by the tiny-cuda-nn authors.
+ */
+#pragma once
+// #include <cubvh/common.h>
+#define PCG32_DEFAULT_STATE  0x853c49e6748fea9bULL
+#define PCG32_DEFAULT_STREAM 0xda3e39cb94b95bdbULL
+#define PCG32_MULT           0x5851f42d4c957f2dULL
+#include <cmath>
+#include <cassert>
+namespace cubvh {
+/// PCG32 Pseudorandom number generator
+struct pcg32 {
+	/// Initialize the pseudorandom number generator with default seed
+	__host__ __device__ pcg32() : state(PCG32_DEFAULT_STATE), inc(PCG32_DEFAULT_STREAM) {}
+	/// Initialize the pseudorandom number generator with the \ref seed() function
+	__host__ __device__ pcg32(uint64_t initstate, uint64_t initseq = 1u) { seed(initstate, initseq); }
+	/**
+	 * \brief Seed the pseudorandom number generator
+	 *
+	 * Specified in two parts: a state initializer and a sequence selection
+	 * constant (a.k.a. stream id)
+	 */
+	__host__ __device__ void seed(uint64_t initstate, uint64_t initseq = 1) {
+		state = 0U;
+		inc = (initseq << 1u) | 1u;
+		next_uint();
+		state += initstate;
+		next_uint();
+	}
+	/// Generate a uniformly distributed unsigned 32-bit random number
+	__host__ __device__ uint32_t next_uint() {
+		uint64_t oldstate = state;
+		state = oldstate * PCG32_MULT + inc;
+		uint32_t xorshifted = (uint32_t) (((oldstate >> 18u) ^ oldstate) >> 27u);
+		uint32_t rot = (uint32_t) (oldstate >> 59u);
+		return (xorshifted >> rot) | (xorshifted << ((~rot + 1u) & 31));
+	}
+	/// Generate a uniformly distributed number, r, where 0 <= r < bound
+	__host__ __device__ uint32_t next_uint(uint32_t bound) {
+		// To avoid bias, we need to make the range of the RNG a multiple of
+		// bound, which we do by dropping output less than a threshold.
+		// A naive scheme to calculate the threshold would be to do
+		//
+		//     uint32_t threshold = 0x100000000ull % bound;
+		//
+		// but 64-bit div/mod is slower than 32-bit div/mod (especially on
+		// 32-bit platforms).  In essence, we do
+		//
+		//     uint32_t threshold = (0x100000000ull-bound) % bound;
+		//
+		// because this version will calculate the same modulus, but the LHS
+		// value is less than 2^32.
+		uint32_t threshold = (~bound+1u) % bound;
+		// Uniformity guarantees that this loop will terminate.  In practice, it
+		// should usually terminate quickly; on average (assuming all bounds are
+		// equally likely), 82.25% of the time, we can expect it to require just
+		// one iteration.  In the worst case, someone passes a bound of 2^31 + 1
+		// (i.e., 2147483649), which invalidates almost 50% of the range.  In
+		// practice, bounds are typically small and only a tiny amount of the range
+		// is eliminated.
+		for (;;) {
+			uint32_t r = next_uint();
+			if (r >= threshold)
+				return r % bound;
+		}
+	}
+	/// Generate a single precision floating point value on the interval [0, 1)
+	__host__ __device__ float next_float() {
+		/* Trick from MTGP: generate an uniformly distributed
+			single precision number in [1,2) and subtract 1. */
+		union {
+			uint32_t u;
+			float f;
+		} x;
+		x.u = (next_uint() >> 9) | 0x3f800000u;
+		return x.f - 1.0f;
+	}
+	/**
+	 * \brief Generate a double precision floating point value on the interval [0, 1)
+	 *
+	 * \remark Since the underlying random number generator produces 32 bit output,
+	 * only the first 32 mantissa bits will be filled (however, the resolution is still
+	 * finer than in \ref next_float(), which only uses 23 mantissa bits)
+	 */
+	__host__ __device__ double next_double() {
+		/* Trick from MTGP: generate an uniformly distributed
+			double precision number in [1,2) and subtract 1. */
+		union {
+			uint64_t u;
+			double d;
+		} x;
+		x.u = ((uint64_t) next_uint() << 20) | 0x3ff0000000000000ULL;
+		return x.d - 1.0;
+	}
+	/**
+	 * \brief Multi-step advance function (jump-ahead, jump-back)
+	 *
+	 * The method used here is based on Brown, "Random Number Generation
+	 * with Arbitrary Stride", Transactions of the American Nuclear
+	 * Society (Nov. 1994). The algorithm is very similar to fast
+	 * exponentiation.
+	 *
+	 * The default value of 2^32 ensures that the PRNG is advanced
+	 * sufficiently far that there is (likely) no overlap with
+	 * previously drawn random numbers, even if small advancements.
+	 * are made inbetween.
+	 */
+	__host__ __device__ void advance(int64_t delta_ = (1ll<<32)) {
+		uint64_t
+			cur_mult = PCG32_MULT,
+			cur_plus = inc,
+			acc_mult = 1u,
+			acc_plus = 0u;
+		/* Even though delta is an unsigned integer, we can pass a signed
+			integer to go backwards, it just goes "the long way round". */
+		uint64_t delta = (uint64_t) delta_;
+		while (delta > 0) {
+			if (delta & 1) {
+				acc_mult *= cur_mult;
+				acc_plus = acc_plus * cur_mult + cur_plus;
+			}
+			cur_plus = (cur_mult + 1) * cur_plus;
+			cur_mult *= cur_mult;
+			delta /= 2;
+		}
+		state = acc_mult * state + acc_plus;
+	}
+	/// Compute the distance between two PCG32 pseudorandom number generators
+	__host__ __device__ int64_t operator-(const pcg32 &other) const {
+		assert(inc == other.inc);
+		uint64_t
+			cur_mult = PCG32_MULT,
+			cur_plus = inc,
+			cur_state = other.state,
+			the_bit = 1u,
+			distance = 0u;
+		while (state != cur_state) {
+			if ((state & the_bit) != (cur_state & the_bit)) {
+				cur_state = cur_state * cur_mult + cur_plus;
+				distance |= the_bit;
+			}
+			assert((state & the_bit) == (cur_state & the_bit));
+			the_bit <<= 1;
+			cur_plus = (cur_mult + 1ULL) * cur_plus;
+			cur_mult *= cur_mult;
+		}
+		return (int64_t) distance;
+	}
+	/// Equality operator
+	__host__ __device__ bool operator==(const pcg32 &other) const { return state == other.state && inc == other.inc; }
+	/// Inequality operator
+	__host__ __device__ bool operator!=(const pcg32 &other) const { return state != other.state || inc != other.inc; }
+	uint64_t state;  // RNG state.  All values are possible.
+	uint64_t inc;    // Controls which RNG sequence (stream) is selected. Must *always* be odd.
+};
+}
\ No newline at end of file
--- a/readme.md
+++ b/readme.md
@@ -30,18 +30,23 @@ import trimesh
 import torch
 import cubvh
-# build BVH from mesh
+### build BVH from mesh
 mesh = trimesh.load('example.ply')
 BVH = cubvh.cuBVH(mesh.vertices, mesh.faces) # build with numpy.ndarray/torch.Tensor
+### query ray-mesh intersection
-# query ray-mesh intersection
 rays_o, rays_d = get_ray(pose, intrinsics, H, W) # [N, 3], [N, 3], query with torch.Tensor (cuda)
 intersections, face_normals, depth = BVH.ray_trace(rays_o, rays_d) # [N, 3], [N, 3], [N,]
-# query unsigned distance
+### query unsigned distance
 points # [N, 3]
 distances, face_id, uvw = BVH.unsigned_distance(points, return_uvw=True) # [N], [N], [N, 3]
+### query signed distance (INNER is NEGETIVE!)
+# for watertight meshes (default)
+distances, face_id, uvw = BVH.signed_distance(points, return_uvw=True, mode='watertight') # [N], [N], [N, 3]
+# for non-watertight meshes:
+distances, face_id, uvw = BVH.signed_distance(points, return_uvw=True, mode='raystab') # [N], [N], [N, 3]
 ```

--- a/src/api.cu
+++ b/src/api.cu
@@ -42,7 +42,6 @@ public:
    }
-    // accept torch tensor (gpu) to init
    void ray_trace(at::Tensor rays_o, at::Tensor rays_d, at::Tensor positions, at::Tensor face_id, at::Tensor depth) {
        const uint32_t n_elements = rays_o.size(0);
@@ -51,7 +50,6 @@ public:
        triangle_bvh->ray_trace_gpu(n_elements, rays_o.data_ptr<float>(), rays_d.data_ptr<float>(), positions.data_ptr<float>(), face_id.data_ptr<int64_t>(), depth.data_ptr<float>(), triangles_gpu.data(), stream);
    }
-    // optionally return uvw (barycentric) and face_id
    void unsigned_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw) {
        const uint32_t n_elements = positions.size(0);
@@ -61,6 +59,14 @@ public:
    }
+    void signed_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw, uint32_t mode) {
+        const uint32_t n_elements = positions.size(0);
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+        triangle_bvh->signed_distance_gpu(n_elements, mode, positions.data_ptr<float>(), distances.data_ptr<float>(), face_id.data_ptr<int64_t>(), uvw.has_value() ? uvw.value().data_ptr<float>() : nullptr, triangles_gpu.data(), stream);
+    }
    std::vector<Triangle> triangles_cpu;
    GPUMemory<Triangle> triangles_gpu;
    std::shared_ptr<TriangleBvh> triangle_bvh;

--- a/src/bindings.cpp
+++ b/src/bindings.cpp
@@ -14,7 +14,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 py::class_<cuBVH>(m, "cuBVH")
    .def("ray_trace", &cuBVH::ray_trace)
-    .def("unsigned_distance", &cuBVH::unsigned_distance);
+    .def("unsigned_distance", &cuBVH::unsigned_distance)
+    .def("signed_distance", &cuBVH::signed_distance);
 m.def("create_cuBVH", &create_cuBVH);

--- a/src/bvh.cu
+++ b/src/bvh.cu
@@ -2,6 +2,7 @@
 #include <cubvh/common.h>
 #include <cubvh/triangle.cuh>
 #include <cubvh/bvh.cuh>
+#include <cubvh/pcg32.h>
 // #ifdef NGP_OPTIX
 // #  include <optix.h>
@@ -132,8 +133,8 @@ constexpr float MAX_DIST_SQ = MAX_DIST*MAX_DIST;
 // }
 // #endif //NGP_OPTIX
-// __global__ void signed_distance_watertight_kernel(uint32_t n_elements, const Vector3f* __restrict__ positions, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float* __restrict__ distances, bool use_existing_distances_as_upper_bounds = false);
+__global__ void signed_distance_watertight_kernel(uint32_t n_elements, const Vector3f* __restrict__ positions, float* __restrict__ distances, int64_t* __restrict__ face_id, Vector3f* __restrict__ uvw, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, bool use_existing_distances_as_upper_bounds);
-// __global__ void signed_distance_raystab_kernel(uint32_t n_elements, const Vector3f* __restrict__ positions, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float* __restrict__ distances, bool use_existing_distances_as_upper_bounds = false);
+__global__ void signed_distance_raystab_kernel(uint32_t n_elements, const Vector3f* __restrict__ positions, float* __restrict__ distances, int64_t* __restrict__ face_id, Vector3f* __restrict__ uvw, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, bool use_existing_distances_as_upper_bounds);
 __global__ void unsigned_distance_kernel(uint32_t n_elements, const Vector3f* __restrict__ positions, float* __restrict__ distances, int64_t* __restrict__ face_id, Vector3f* __restrict__ uvw, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, bool use_existing_distances_as_upper_bounds);
 __global__ void raytrace_kernel(uint32_t n_elements, const Vector3f* __restrict__ rays_o, const Vector3f* __restrict__ rays_d, Vector3f* __restrict__ positions, int64_t* __restrict__ face_id, float* __restrict__ depth, const TriangleBvhNode* __restrict__ nodes, const Triangle* __restrict__ triangles);
@@ -391,76 +392,69 @@ public:
        return result / total_weight;
    }
-    // __host__ __device__ static float signed_distance_watertight(const Vector3f& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq = MAX_DIST_SQ) {
+    __host__ __device__ static std::pair<int, float> signed_distance_watertight(const Vector3f& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq = MAX_DIST_SQ) {
-    //     auto p = closest_triangle(point, bvhnodes, triangles, max_distance_sq);
+        auto res = closest_triangle(point, bvhnodes, triangles, max_distance_sq);
-    //     const Triangle& tri = triangles[p.first];
+        const Triangle& tri = triangles[res.first];
-    //     Vector3f closest_point = tri.closest_point(point);
+        Vector3f closest_point = tri.closest_point(point);
-    //     Vector3f avg_normal = avg_normal_around_point(closest_point, bvhnodes, triangles);
+        Vector3f avg_normal = avg_normal_around_point(closest_point, bvhnodes, triangles);
-    //     return std::copysignf(p.second, avg_normal.dot(point - closest_point));
+        return {res.first, std::copysignf(res.second, avg_normal.dot(point - closest_point))};
-    // }
+    }
-    // __host__ __device__ static float signed_distance_raystab(const Vector3f& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq = MAX_DIST_SQ, default_rng_t rng={}) {
+    __host__ __device__ static std::pair<int, float> signed_distance_raystab(const Vector3f& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq = MAX_DIST_SQ, pcg32 rng={}) {
-    //     float distance = unsigned_distance(point, bvhnodes, triangles, max_distance_sq);
+        auto res = closest_triangle(point, bvhnodes, triangles, max_distance_sq);
-    //     Vector2f offset = random_val_2d(rng);
+        Vector2f offset = {rng.next_float(), rng.next_float()};
-    //     static constexpr uint32_t N_STAB_RAYS = 32;
+        static constexpr uint32_t N_STAB_RAYS = 32;
-    //     for (uint32_t i = 0; i < N_STAB_RAYS; ++i) {
+        for (uint32_t i = 0; i < N_STAB_RAYS; ++i) {
-    //         // Use a Fibonacci lattice (with random offset) to regularly
+            // Use a Fibonacci lattice (with random offset) to regularly
-    //         // distribute the stab rays over the sphere.
+            // distribute the stab rays over the sphere.
-    //         Vector3f d = fibonacci_dir<N_STAB_RAYS>(i, offset);
+            Vector3f d = fibonacci_dir<N_STAB_RAYS>(i, offset);
-    //         // If any of the stab rays goes outside the mesh, the SDF is positive.
+            // If any of the stab rays goes outside the mesh, the SDF is positive.
-    //         if (ray_intersect(point, -d, bvhnodes, triangles).first < 0 || ray_intersect(point, d, bvhnodes, triangles).first < 0) {
+            if (ray_intersect(point, -d, bvhnodes, triangles).first < 0 || ray_intersect(point, d, bvhnodes, triangles).first < 0) {
-    //             return distance;
+                return {res.first, res.second};
-    //         }
+            }
-    //     }
+        }
-    //     return -distance;
+        return {res.first, -res.second};
-    // }
+    }
-    // // // Assumes that "point" is a location on a triangle
-    // // Vector3f avg_normal_around_point(const Vector3f& point, const Triangle* __restrict__ triangles) const {
+    void signed_distance_gpu(uint32_t n_elements, uint32_t mode, const float* positions, float* distances, int64_t* face_id, float* uvw, const Triangle* gpu_triangles, cudaStream_t stream) override {
-    // //     return avg_normal_around_point(point, m_nodes.data(), triangles);
-    // // }
+        const Vector3f* positions_vec = (const Vector3f*)positions;
+        Vector3f* uvw_vec = (Vector3f*)uvw;
-    // // float signed_distance(EMeshSdfMode mode, const Vector3f& point, const std::vector<Triangle>& triangles) const {
-    // //     if (mode == EMeshSdfMode::Watertight) {
+        if (mode == 0) {
-    // //         return signed_distance_watertight(point, m_nodes.data(), triangles.data());
+            // watertight
-    // //     } else {
+            linear_kernel(signed_distance_watertight_kernel, 0u, stream,
-    // //         return signed_distance_raystab(point, m_nodes.data(), triangles.data());
+                n_elements,
-    // //     }
+                positions_vec,
-    // // }
+                distances,
+                face_id,
-    // void signed_distance_gpu(uint32_t n_elements, EMeshSdfMode mode, const Vector3f* gpu_positions, float* gpu_distances, const Triangle* gpu_triangles, bool use_existing_distances_as_upper_bounds, cudaStream_t stream) override {
+                uvw_vec,
-    //     if (mode == EMeshSdfMode::Watertight) {
+                m_nodes_gpu.data(),
-    //         linear_kernel(signed_distance_watertight_kernel, 0, stream,
+                gpu_triangles,
-    //             n_elements,
+                false
-    //             gpu_positions,
+            );
-    //             m_nodes_gpu.data(),
-    //             gpu_triangles,
+        } else {
-    //             gpu_distances,
+            // raystab
-    //             use_existing_distances_as_upper_bounds
+            linear_kernel(signed_distance_raystab_kernel, 0u, stream,
-    //         );
+                n_elements,
-    //     } else {
+                positions_vec,
-    //         {
+                distances,
-    //             if (mode == EMeshSdfMode::Raystab) {
+                face_id,
-    //                 linear_kernel(signed_distance_raystab_kernel, 0, stream,
+                uvw_vec,
-    //                     n_elements,
+                m_nodes_gpu.data(),
-    //                     gpu_positions,
+                gpu_triangles,
-    //                     m_nodes_gpu.data(),
+                false
-    //                     gpu_triangles,
+            );
-    //                     gpu_distances,
+        }
-    //                     use_existing_distances_as_upper_bounds
+    }
-    //                 );
-    //             } else if (mode == EMeshSdfMode::PathEscape) {
-    //                 throw std::runtime_error{"TriangleBvh: EMeshSdfMode::PathEscape is only supported with OptiX enabled."};
-    //             }
-    //         }
-    //     }
-    // }
    void unsigned_distance_gpu(uint32_t n_elements, const float* positions, float* distances, int64_t* face_id, float* uvw, const Triangle* gpu_triangles, cudaStream_t stream) override {
@@ -477,7 +471,6 @@ public:
            gpu_triangles,
            false
        );
    }
    void ray_trace_gpu(uint32_t n_elements, const float* rays_o, const float* rays_d, float* positions, int64_t* face_id, float* depth, const Triangle* gpu_triangles, cudaStream_t stream) override {
@@ -622,37 +615,61 @@ std::unique_ptr<TriangleBvh> TriangleBvh::make() {
    return std::unique_ptr<TriangleBvh>(new TriangleBvh4());
 }
-// __global__ void signed_distance_watertight_kernel(uint32_t n_elements,
+__global__ void signed_distance_watertight_kernel(
-//     const Vector3f* __restrict__ positions,
+    uint32_t n_elements, const Vector3f* __restrict__ positions,
-//     const TriangleBvhNode* __restrict__ bvhnodes,
+    float* __restrict__ distances, int64_t* __restrict__ face_id, Vector3f* __restrict__ uvw,
-//     const Triangle* __restrict__ triangles,
+    const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, bool use_existing_distances_as_upper_bounds
-//     float* __restrict__ distances,
+) {
-//     bool use_existing_distances_as_upper_bounds
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
-// ) {
+    if (i >= n_elements) return;
-//     uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
-//     if (i >= n_elements) return;
-//     float max_distance = use_existing_distances_as_upper_bounds ? distances[i] : MAX_DIST;
-//     distances[i] = TriangleBvh4::signed_distance_watertight(positions[i], bvhnodes, triangles, max_distance*max_distance);
-// }
-// __global__ void signed_distance_raystab_kernel(
+    float max_distance = use_existing_distances_as_upper_bounds ? distances[i] : MAX_DIST;
-//     uint32_t n_elements,
-//     const Vector3f* __restrict__ positions,
+    Vector3f point = positions[i];
-//     const TriangleBvhNode* __restrict__ bvhnodes,
-//     const Triangle* __restrict__ triangles,
+    auto res = TriangleBvh4::signed_distance_watertight(point, bvhnodes, triangles, max_distance*max_distance);
-//     float* __restrict__ distances,
-//     bool use_existing_distances_as_upper_bounds
+    // write 
-// ) {
+    distances[i] = res.second;
-//     uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    face_id[i] = triangles[res.first].id;
-//     if (i >= n_elements) return;
+    // optional output
-//     float max_distance = use_existing_distances_as_upper_bounds ? distances[i] : MAX_DIST;
+    if (uvw) {
-//     default_rng_t rng;
+        // get closest point
-//     rng.advance(i * 2);
+        Vector3f cpoint = triangles[res.first].closest_point(point);
+        // query uvw
-//     distances[i] = TriangleBvh4::signed_distance_raystab(positions[i], bvhnodes, triangles, max_distance*max_distance, rng);
+        uvw[i] = triangles[res.first].barycentric(cpoint);
-// }
+    }
+}
+__global__ void signed_distance_raystab_kernel(
+    uint32_t n_elements, const Vector3f* __restrict__ positions,
+    float* __restrict__ distances, int64_t* __restrict__ face_id, Vector3f* __restrict__ uvw,
+    const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, bool use_existing_distances_as_upper_bounds
+) {
+    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n_elements) return;
+    float max_distance = use_existing_distances_as_upper_bounds ? distances[i] : MAX_DIST;
+    pcg32 rng;
+    rng.advance(i * 2);
+    Vector3f point = positions[i];
+    auto res = TriangleBvh4::signed_distance_raystab(point, bvhnodes, triangles, max_distance*max_distance, rng);
+    // write 
+    distances[i] = res.second;
+    face_id[i] = triangles[res.first].id;
+    // optional output
+    if (uvw) {
+        // get closest point
+        Vector3f cpoint = triangles[res.first].closest_point(point);
+        // query uvw
+        uvw[i] = triangles[res.first].barycentric(cpoint);
+    }
+}
 __global__ void unsigned_distance_kernel(
    uint32_t n_elements, const Vector3f* __restrict__ positions,

--- a/test/signed_distance.py
+++ b/test/signed_distance.py
+import numpy as np
+import trimesh
+import argparse
+import torch
+import cubvh
+import time
+def create_dodecahedron(radius=1, center=np.array([0, 0, 0])):
+    vertices = np.array([
+        -0.57735,  -0.57735,  0.57735,
+        0.934172,  0.356822,  0,
+        0.934172,  -0.356822,  0,
+        -0.934172,  0.356822,  0,
+        -0.934172,  -0.356822,  0,
+        0,  0.934172,  0.356822,
+        0,  0.934172,  -0.356822,
+        0.356822,  0,  -0.934172,
+        -0.356822,  0,  -0.934172,
+        0,  -0.934172,  -0.356822,
+        0,  -0.934172,  0.356822,
+        0.356822,  0,  0.934172,
+        -0.356822,  0,  0.934172,
+        0.57735,  0.57735,  -0.57735,
+        0.57735,  0.57735,  0.57735,
+        -0.57735,  0.57735,  -0.57735,
+        -0.57735,  0.57735,  0.57735,
+        0.57735,  -0.57735,  -0.57735,
+        0.57735,  -0.57735,  0.57735,
+        -0.57735,  -0.57735,  -0.57735,
+        ]).reshape((-1,3), order="C")
+    faces = np.array([
+        19, 3, 2,
+        12, 19, 2,
+        15, 12, 2,
+        8, 14, 2,
+        18, 8, 2,
+        3, 18, 2,
+        20, 5, 4,
+        9, 20, 4,
+        16, 9, 4,
+        13, 17, 4,
+        1, 13, 4,
+        5, 1, 4,
+        7, 16, 4,
+        6, 7, 4,
+        17, 6, 4,
+        6, 15, 2,
+        7, 6, 2,
+        14, 7, 2,
+        10, 18, 3,
+        11, 10, 3,
+        19, 11, 3,
+        11, 1, 5,
+        10, 11, 5,
+        20, 10, 5,
+        20, 9, 8,
+        10, 20, 8,
+        18, 10, 8,
+        9, 16, 7,
+        8, 9, 7,
+        14, 8, 7,
+        12, 15, 6,
+        13, 12, 6,
+        17, 13, 6,
+        13, 1, 11,
+        12, 13, 11,
+        19, 12, 11,
+        ]).reshape((-1, 3), order="C")-1
+    length = np.linalg.norm(vertices, axis=1).reshape((-1, 1))
+    vertices = vertices / length * radius + center
+    return trimesh.Trimesh(vertices=vertices, faces=faces)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--N', default=1000, type=int)
+    parser.add_argument('--mesh', default='', type=str)
+    opt = parser.parse_args()
+    if opt.mesh == '':
+        mesh = create_dodecahedron()
+    else:
+        mesh = trimesh.load(opt.mesh, force='mesh', skip_material=True)
+    # query nearest triangles for a set of points
+    points = torch.randn(opt.N, 3, device='cuda', dtype=torch.float32)
+    # Ours
+    _t0 = time.time()
+    BVH = cubvh.cuBVH(mesh.vertices, mesh.faces)
+    torch.cuda.synchronize()
+    _t1 = time.time()
+    distances, face_id, _ = BVH.signed_distance(points, return_uvw=False, mode='raystab')
+    torch.cuda.synchronize()
+    _t2 = time.time()
+    print(f'[TIME] Ours total {_t2 - _t0:.6f}s = build {_t1 - _t0:.6f}s + query {_t2 - _t1:.6f}s')
+    # GT results by trimesh
+    _t0 = time.time()
+    gt_distances = -trimesh.proximity.signed_distance(mesh, points.cpu().numpy())
+    _t1 = time.time()
+    print(f'[TIME] Trimesh total {_t1 - _t0:.6f}s')
+    # verify correctness
+    distances = distances.cpu().numpy().astype(np.float32)
+    np.testing.assert_allclose(
+        distances,
+        gt_distances.astype(np.float32),
+        atol=1e-5
+    )
\ No newline at end of file