Commit e1eb521b authored by ashawkey

init
MIT License
Copyright (c) 2022 kiui
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Copyright (c) 2022, NVIDIA Corporation & affiliates. All rights reserved.
NVIDIA Source Code License for instant neural graphics primitives
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
from .api import cuBVH
import numpy as np
import torch
# CUDA extension
import _cubvh as _backend
class cuBVH():
def __init__(self, vertices, triangles):
# vertices: np.ndarray, [N, 3]
# triangles: np.ndarray, [M, 3]
if torch.is_tensor(vertices): vertices = vertices.detach().cpu().numpy()
if torch.is_tensor(triangles): triangles = triangles.detach().cpu().numpy()
assert triangles.shape[0] > 8, "BVH needs more than 8 triangles."
# implementation
self.impl = _backend.create_cuBVH(vertices, triangles)
def ray_trace(self, rays_o, rays_d):
# rays_o: torch.Tensor, float, [N, 3]
# rays_d: torch.Tensor, float, [N, 3]
rays_o = rays_o.float().contiguous()
rays_d = rays_d.float().contiguous()
if not rays_o.is_cuda: rays_o = rays_o.cuda()
if not rays_d.is_cuda: rays_d = rays_d.cuda()
prefix = rays_o.shape[:-1]
rays_o = rays_o.view(-1, 3)
rays_d = rays_d.view(-1, 3)
N = rays_o.shape[0]
# init output buffers
positions = torch.empty(N, 3, dtype=torch.float32, device=rays_o.device)
face_id = torch.empty(N, dtype=torch.int64, device=rays_o.device)
depth = torch.empty(N, dtype=torch.float32, device=rays_o.device)
self.impl.ray_trace(rays_o, rays_d, positions, face_id, depth) # writes positions [N, 3], face_id [N], depth [N]
positions = positions.view(*prefix, 3)
face_id = face_id.view(*prefix)
depth = depth.view(*prefix)
return positions, face_id, depth
def unsigned_distance(self, positions, return_uvw=False):
# positions: torch.Tensor, float, [N, 3]
positions = positions.float().contiguous()
if not positions.is_cuda: positions = positions.cuda()
prefix = positions.shape[:-1]
positions = positions.view(-1, 3)
N = positions.shape[0]
# init output buffers
distances = torch.empty(N, dtype=torch.float32, device=positions.device)
face_id = torch.empty(N, dtype=torch.int64, device=positions.device)
if return_uvw:
uvw = torch.empty(N, 3, dtype=torch.float32, device=positions.device)
else:
uvw = None
self.impl.unsigned_distance(positions, distances, face_id, uvw) # writes distances [N], face_id [N], uvw [N, 3]
distances = distances.view(*prefix)
face_id = face_id.view(*prefix)
if uvw is not None:
uvw = uvw.view(*prefix, 3)
return distances, face_id, uvw
# def signed_distance():
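The wrapper above flattens any leading batch dimensions before calling into the CUDA backend and restores them on the outputs, so queries may use arbitrary prefix shapes. A minimal usage sketch (assuming a mesh file `example.ply` and a CUDA device are available; shapes are illustrative):

```python
import torch
import trimesh
import cubvh

# build the BVH once from a mesh (numpy arrays or torch tensors are accepted)
mesh = trimesh.load('example.ply', force='mesh')
bvh = cubvh.cuBVH(mesh.vertices, mesh.faces)

# rays with an [H, W] prefix: outputs keep the same prefix shape
H, W = 4, 5
rays_o = torch.zeros(H, W, 3, device='cuda')
rays_d = torch.nn.functional.normalize(torch.randn(H, W, 3, device='cuda'), dim=-1)
positions, face_id, depth = bvh.ray_trace(rays_o, rays_d)  # [H, W, 3], [H, W], [H, W]
```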
#pragma once
#include <Eigen/Dense>
#include <ATen/cuda/CUDAContext.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
using namespace Eigen;
using Verts = Matrix<float, Dynamic, 3, RowMajor>;
using Trigs = Matrix<uint32_t, Dynamic, 3, RowMajor>;
namespace cubvh {
// abstract class of raytracer
class cuBVH {
public:
cuBVH() {}
virtual ~cuBVH() {}
virtual void ray_trace(at::Tensor rays_o, at::Tensor rays_d, at::Tensor positions, at::Tensor face_id, at::Tensor depth) = 0;
virtual void unsigned_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw) = 0;
};
// function to create an implementation of cuBVH
cuBVH* create_cuBVH(Ref<const Verts> vertices, Ref<const Trigs> triangles);
} // namespace cubvh
#pragma once
#include <cubvh/common.h>
#include <cubvh/triangle.cuh>
namespace cubvh {
template <int N_POINTS>
__host__ __device__ inline void project(Eigen::Vector3f points[N_POINTS], const Eigen::Vector3f& axis, float& min, float& max) {
min = std::numeric_limits<float>::infinity();
max = -std::numeric_limits<float>::infinity();
#pragma unroll
for (uint32_t i = 0; i < N_POINTS; ++i) {
float val = axis.dot(points[i]);
if (val < min) {
min = val;
}
if (val > max) {
max = val;
}
}
}
struct BoundingBox {
Eigen::Vector3f min = Eigen::Vector3f::Constant(std::numeric_limits<float>::infinity());
Eigen::Vector3f max = Eigen::Vector3f::Constant(-std::numeric_limits<float>::infinity());
__host__ __device__ BoundingBox() {}
__host__ __device__ BoundingBox(const Eigen::Vector3f& a, const Eigen::Vector3f& b) : min{a}, max{b} {}
__host__ __device__ explicit BoundingBox(const Triangle& tri) {
min = max = tri.a;
enlarge(tri.b);
enlarge(tri.c);
}
BoundingBox(std::vector<Triangle>::iterator begin, std::vector<Triangle>::iterator end) {
min = max = begin->a;
for (auto it = begin; it != end; ++it) {
enlarge(*it);
}
}
__host__ __device__ void enlarge(const BoundingBox& other) {
min = min.cwiseMin(other.min);
max = max.cwiseMax(other.max);
}
__host__ __device__ void enlarge(const Triangle& tri) {
enlarge(tri.a);
enlarge(tri.b);
enlarge(tri.c);
}
__host__ __device__ void enlarge(const Eigen::Vector3f& point) {
min = min.cwiseMin(point);
max = max.cwiseMax(point);
}
__host__ __device__ void inflate(float amount) {
min -= Eigen::Vector3f::Constant(amount);
max += Eigen::Vector3f::Constant(amount);
}
__host__ __device__ Eigen::Vector3f diag() const {
return max - min;
}
__host__ __device__ Eigen::Vector3f relative_pos(const Eigen::Vector3f& pos) const {
return (pos - min).cwiseQuotient(diag());
}
__host__ __device__ Eigen::Vector3f center() const {
return 0.5f * (max + min);
}
__host__ __device__ BoundingBox intersection(const BoundingBox& other) const {
BoundingBox result = *this;
result.min = result.min.cwiseMax(other.min);
result.max = result.max.cwiseMin(other.max);
return result;
}
__host__ __device__ bool intersects(const BoundingBox& other) const {
return !intersection(other).is_empty();
}
// Based on the separating axis theorem
// (https://fileadmin.cs.lth.se/cs/Personal/Tomas_Akenine-Moller/code/tribox_tam.pdf)
// Code adapted from a C# implementation at stack overflow
// https://stackoverflow.com/a/17503268
__host__ __device__ bool intersects(const Triangle& triangle) const {
float triangle_min, triangle_max;
float box_min, box_max;
// Test the box normals (x-, y- and z-axes)
Eigen::Vector3f box_normals[3] = {
Eigen::Vector3f{1.0f, 0.0f, 0.0f},
Eigen::Vector3f{0.0f, 1.0f, 0.0f},
Eigen::Vector3f{0.0f, 0.0f, 1.0f},
};
Eigen::Vector3f triangle_normal = triangle.normal();
Eigen::Vector3f triangle_verts[3];
triangle.get_vertices(triangle_verts);
for (int i = 0; i < 3; i++) {
project<3>(triangle_verts, box_normals[i], triangle_min, triangle_max);
if (triangle_max < min[i] || triangle_min > max[i]) {
return false; // No intersection possible.
}
}
Eigen::Vector3f verts[8];
get_vertices(verts);
// Test the triangle normal
float triangle_offset = triangle_normal.dot(triangle.a);
project<8>(verts, triangle_normal, box_min, box_max);
if (box_max < triangle_offset || box_min > triangle_offset) {
return false; // No intersection possible.
}
// Test the nine edge cross-products
Eigen::Vector3f edges[3] = {
triangle.a - triangle.b,
triangle.a - triangle.c,
triangle.b - triangle.c,
};
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
// The box normals are the same as its edge tangents
Eigen::Vector3f axis = edges[i].cross(box_normals[j]);
project<8>(verts, axis, box_min, box_max);
project<3>(triangle_verts, axis, triangle_min, triangle_max);
if (box_max < triangle_min || box_min > triangle_max)
return false; // No intersection possible
}
}
// No separating axis found.
return true;
}
__host__ __device__ Eigen::Vector2f ray_intersect(Eigen::Ref<const Eigen::Vector3f> pos, Eigen::Ref<const Eigen::Vector3f> dir) const {
float tmin = (min.x() - pos.x()) / dir.x();
float tmax = (max.x() - pos.x()) / dir.x();
if (tmin > tmax) {
host_device_swap(tmin, tmax);
}
float tymin = (min.y() - pos.y()) / dir.y();
float tymax = (max.y() - pos.y()) / dir.y();
if (tymin > tymax) {
host_device_swap(tymin, tymax);
}
if (tmin > tymax || tymin > tmax) {
return { std::numeric_limits<float>::max(), std::numeric_limits<float>::max() };
}
if (tymin > tmin) {
tmin = tymin;
}
if (tymax < tmax) {
tmax = tymax;
}
float tzmin = (min.z() - pos.z()) / dir.z();
float tzmax = (max.z() - pos.z()) / dir.z();
if (tzmin > tzmax) {
host_device_swap(tzmin, tzmax);
}
if (tmin > tzmax || tzmin > tmax) {
return { std::numeric_limits<float>::max(), std::numeric_limits<float>::max() };
}
if (tzmin > tmin) {
tmin = tzmin;
}
if (tzmax < tmax) {
tmax = tzmax;
}
return { tmin, tmax };
}
__host__ __device__ bool is_empty() const {
return (max.array() < min.array()).any();
}
__host__ __device__ bool contains(const Eigen::Vector3f& p) const {
return
p.x() >= min.x() && p.x() <= max.x() &&
p.y() >= min.y() && p.y() <= max.y() &&
p.z() >= min.z() && p.z() <= max.z();
}
/// Calculate the point-AABB distance
__host__ __device__ float distance(const Eigen::Vector3f& p) const {
return sqrt(distance_sq(p));
}
__host__ __device__ float distance_sq(const Eigen::Vector3f& p) const {
return (min - p).cwiseMax(p - max).cwiseMax(0.0f).squaredNorm();
}
__host__ __device__ float signed_distance(const Eigen::Vector3f& p) const {
Eigen::Vector3f q = (p - min).cwiseAbs() - diag();
return q.cwiseMax(0.0f).norm() + std::min(std::max(q.x(), std::max(q.y(), q.z())), 0.0f);
}
__host__ __device__ void get_vertices(Eigen::Vector3f v[8]) const {
v[0] = {min.x(), min.y(), min.z()};
v[1] = {min.x(), min.y(), max.z()};
v[2] = {min.x(), max.y(), min.z()};
v[3] = {min.x(), max.y(), max.z()};
v[4] = {max.x(), min.y(), min.z()};
v[5] = {max.x(), min.y(), max.z()};
v[6] = {max.x(), max.y(), min.z()};
v[7] = {max.x(), max.y(), max.z()};
}
};
inline std::ostream& operator<< (std::ostream& os, const cubvh::BoundingBox& bb) {
os << "[";
os << "min=[" << bb.min.x() << "," << bb.min.y() << "," << bb.min.z() << "], ";
os << "max=[" << bb.max.x() << "," << bb.max.y() << "," << bb.max.z() << "]";
os << "]";
return os;
}
}
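A short host-side sketch of how the box-triangle (separating axis) and ray-slab tests above can be exercised; this is illustrative only and must be compiled with nvcc, since the headers mark their functions `__host__ __device__`:

```cpp
// Illustrative only, not part of the library.
#include <cubvh/bounding_box.cuh>
#include <cubvh/triangle.cuh>
#include <cstdio>

int main() {
    using namespace cubvh;
    Triangle tri{Eigen::Vector3f(0, 0, 0), Eigen::Vector3f(1, 0, 0), Eigen::Vector3f(0, 1, 0), 0};
    BoundingBox bb(tri);   // tight box around the triangle
    bb.inflate(0.1f);      // pad it slightly
    bool hit = bb.intersects(tri);  // SAT-based box-triangle test
    Eigen::Vector2f t = bb.ray_intersect(Eigen::Vector3f(0.25f, 0.25f, -1.0f),
                                         Eigen::Vector3f(0.0f, 0.0f, 1.0f));  // slab test, returns (tmin, tmax)
    printf("intersects=%d, tmin=%f, tmax=%f\n", (int)hit, t.x(), t.y());
    return 0;
}
```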
#pragma once
#include <cubvh/common.h>
#include <cubvh/triangle.cuh>
#include <cubvh/bounding_box.cuh>
#include <cubvh/gpu_memory.h>
#include <memory>
namespace cubvh {
struct TriangleBvhNode {
BoundingBox bb;
int left_idx; // negative values indicate leaves
int right_idx;
};
template <typename T, int MAX_SIZE=32>
class FixedStack {
public:
__host__ __device__ void push(T val) {
if (m_count >= MAX_SIZE-1) {
printf("WARNING TOO BIG\n");
}
m_elems[m_count++] = val;
}
__host__ __device__ T pop() {
return m_elems[--m_count];
}
__host__ __device__ bool empty() const {
return m_count <= 0;
}
private:
T m_elems[MAX_SIZE];
int m_count = 0;
};
using FixedIntStack = FixedStack<int>;
class TriangleBvh {
protected:
std::vector<TriangleBvhNode> m_nodes;
GPUMemory<TriangleBvhNode> m_nodes_gpu;
TriangleBvh() {};
public:
virtual void build(std::vector<Triangle>& triangles, uint32_t n_primitives_per_leaf) = 0;
virtual void unsigned_distance_gpu(uint32_t n_elements, const float* positions, float* distances, int64_t* face_id, float* uvw, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
virtual void ray_trace_gpu(uint32_t n_elements, const float* rays_o, const float* rays_d, float* positions, int64_t* face_id, float* depth, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
// KIUI: not supported now.
// virtual void signed_distance_gpu(uint32_t n_elements, EMeshSdfMode mode, const Eigen::Vector3f* gpu_positions, float* gpu_distances, const Triangle* gpu_triangles, bool use_existing_distances_as_upper_bounds, cudaStream_t stream) = 0;
// virtual bool touches_triangle(const BoundingBox& bb, const Triangle* __restrict__ triangles) const = 0;
// virtual void build_optix(const GPUMemory<Triangle>& triangles, cudaStream_t stream) = 0;
static std::unique_ptr<TriangleBvh> make();
TriangleBvhNode* nodes_gpu() const {
return m_nodes_gpu.data();
}
};
}
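For context, `src/api.cu` (further below) drives this interface by creating an implementation through `make()`, building it on the host triangle list, and then issuing GPU queries. A compressed sketch, assuming the triangle list and the output buffers are already prepared on the appropriate devices:

```cpp
// Illustrative only; mirrors the call sequence in src/api.cu.
#include <cubvh/bvh.cuh>

void build_and_query(std::vector<cubvh::Triangle>& tris_cpu,
                     cubvh::GPUMemory<cubvh::Triangle>& tris_gpu,
                     uint32_t n_rays, const float* rays_o, const float* rays_d,
                     float* positions, int64_t* face_id, float* depth,
                     cudaStream_t stream) {
    auto bvh = cubvh::TriangleBvh::make();  // factory implemented in src/bvh.cu
    bvh->build(tris_cpu, 8);                // 8 primitives per leaf, as in api.cu
    tris_gpu.resize_and_copy_from_host(tris_cpu);
    bvh->ray_trace_gpu(n_rays, rays_o, rays_d, positions, face_id, depth,
                       tris_gpu.data(), stream);
}
```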
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include <cstdint>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <Eigen/Dense>
namespace cubvh {
static constexpr float PI = 3.14159265358979323846f;
static constexpr float SQRT2 = 1.41421356237309504880f;
// enum class EMeshSdfMode : int {
// Watertight,
// Raystab,
// PathEscape,
// };
// static constexpr const char* MeshSdfModeStr = "Watertight\0Raystab\0PathEscape\0\0";
template <typename T>
__host__ __device__ T div_round_up(T val, T divisor) {
return (val + divisor - 1) / divisor;
}
constexpr uint32_t n_threads_linear = 128;
template <typename T>
constexpr uint32_t n_blocks_linear(T n_elements) {
return (uint32_t)div_round_up(n_elements, (T)n_threads_linear);
}
template <typename K, typename T, typename ... Types>
inline void linear_kernel(K kernel, uint32_t shmem_size, cudaStream_t stream, T n_elements, Types ... args) {
if (n_elements <= 0) {
return;
}
kernel<<<n_blocks_linear(n_elements), n_threads_linear, shmem_size, stream>>>((uint32_t)n_elements, args...);
}
inline __host__ __device__ float sign(float x) {
return copysignf(1.0f, x);
}
template <typename T>
__host__ __device__ T clamp(T val, T lower, T upper) {
return val < lower ? lower : (upper < val ? upper : val);
}
template <typename T>
__host__ __device__ void host_device_swap(T& a, T& b) {
T c(a); a=b; b=c;
}
}
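The `linear_kernel` helper hides the usual grid-size computation: any kernel whose first parameter is `uint32_t n_elements` can be launched over `n` elements with 128-thread blocks. A small sketch (the kernel `scale_kernel` is hypothetical, not part of cubvh):

```cpp
// Illustrative only; `scale_kernel` is a made-up example kernel.
#include <cubvh/common.h>

__global__ void scale_kernel(uint32_t n_elements, float* data, float factor) {
    const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= n_elements) return;
    data[i] *= factor;
}

void scale_on_device(float* d_data, uint32_t n, cudaStream_t stream) {
    // launches n_blocks_linear(n) blocks of n_threads_linear (128) threads
    cubvh::linear_kernel(scale_kernel, 0, stream, n, d_data, 2.0f);
}
```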
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file gpu_memory.h
* @author Nikolaus Binder and Thomas Müller, NVIDIA
* @brief Managed memory on the GPU. Like a std::vector, memory is allocated either explicitly (resize/enlarge)
* or implicitly (resize_and_copy_from_host etc). Memory is always and automatically released in the destructor.
*/
#pragma once
#include <cubvh/common.h>
#include <atomic>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <vector>
/// Checks the result of a cudaXXXXXX call and throws an error on failure
#define CUDA_CHECK_THROW(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) \
throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + cudaGetErrorString(result)); \
} while(0)
namespace cubvh {
#define DEBUG_GUARD_SIZE 0
inline std::atomic<size_t>& total_n_bytes_allocated() {
static std::atomic<size_t> s_total_n_bytes_allocated{0};
return s_total_n_bytes_allocated;
}
/// Managed memory on the Device
template<class T>
class GPUMemory {
private:
T* m_data = nullptr;
size_t m_size = 0; // Number of elements
bool m_owned = true;
public:
GPUMemory() {}
GPUMemory<T>& operator=(GPUMemory<T>&& other) {
std::swap(m_data, other.m_data);
std::swap(m_size, other.m_size);
return *this;
}
GPUMemory(GPUMemory<T>&& other) {
*this = std::move(other);
}
__host__ __device__ GPUMemory(const GPUMemory<T> &other) : m_data{other.m_data}, m_size{other.m_size}, m_owned{false} {}
void check_guards() const {
#if DEBUG_GUARD_SIZE > 0
if (!m_data)
return;
uint8_t buf[DEBUG_GUARD_SIZE];
const uint8_t *rawptr=(const uint8_t *)m_data;
cudaMemcpy(buf, rawptr-DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xff) {
printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i] );
break;
}
cudaMemcpy(buf, rawptr+m_size*sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xfe) {
printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i] );
break;
}
#endif
}
void allocate_memory(size_t n_bytes) {
if (n_bytes == 0) {
return;
}
#ifdef TCNN_VERBOSE_MEMORY_ALLOCS
std::cout << "GPUMemory: Allocating " << bytes_to_string(n_bytes) << "." << std::endl;
#endif
uint8_t *rawptr = nullptr;
CUDA_CHECK_THROW(cudaMalloc(&rawptr, n_bytes+DEBUG_GUARD_SIZE*2));
#if DEBUG_GUARD_SIZE > 0
CUDA_CHECK_THROW(cudaMemset(rawptr , 0xff, DEBUG_GUARD_SIZE));
CUDA_CHECK_THROW(cudaMemset(rawptr+n_bytes+DEBUG_GUARD_SIZE , 0xfe, DEBUG_GUARD_SIZE));
#endif
if (rawptr) rawptr+=DEBUG_GUARD_SIZE;
m_data=(T*)(rawptr);
total_n_bytes_allocated() += n_bytes;
}
void free_memory() {
if (!m_data) {
return;
}
uint8_t *rawptr = (uint8_t*)m_data;
if (rawptr) rawptr-=DEBUG_GUARD_SIZE;
CUDA_CHECK_THROW(cudaFree(rawptr));
total_n_bytes_allocated() -= get_bytes();
m_data = nullptr;
}
/// Allocates memory for size items of type T
GPUMemory(const size_t size) {
resize(size);
}
/// Frees memory again
__host__ __device__ ~GPUMemory() {
#ifndef __CUDA_ARCH__
if (!m_owned) {
return;
}
try {
if (m_data) {
free_memory();
m_size = 0;
}
} catch (std::runtime_error error) {
// Don't need to report on memory-free problems when the driver is shutting down.
if (std::string{error.what()}.find("driver shutting down") == std::string::npos) {
fprintf(stderr, "Could not free memory: %s\n", error.what());
}
}
#endif
}
/** @name Resizing/enlargement
* @{
*/
/// Resizes the array to the exact new size, even if it is already larger
void resize(const size_t size) {
if (!m_owned) {
throw std::runtime_error("Cannot resize non-owned memory.");
}
if (m_size != size) {
if (m_size) {
try {
free_memory();
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not free memory: ") + error.what());
}
}
if (size > 0) {
try {
allocate_memory(size * sizeof(T));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not allocate memory: ") + error.what());
}
}
m_size = size;
}
}
/// Enlarges the array if its size is smaller
void enlarge(const size_t size) {
if (size > m_size) {
resize(size);
}
}
/** @} */
/** @name Memset
* @{
*/
/// Sets the memory of the first num_elements to value
void memset(const int value, const size_t num_elements, const size_t offset = 0) {
if (num_elements + offset > m_size) {
throw std::runtime_error("Could not set memory: Number of elements larger than allocated memory");
}
try {
CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T)));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not set memory: ") + error.what());
}
}
/// Sets the memory of all elements to value
void memset(const int value) {
memset(value, m_size);
}
/** @} */
/** @name Copy operations
* @{
*/
/// Copy data of num_elements from the raw pointer on the host
void copy_from_host(const T* host_data, const size_t num_elements) {
try {
CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not copy from host: ") + error.what());
}
}
/// Copy num_elements from the host vector
void copy_from_host(const std::vector<T>& data, const size_t num_elements) {
if (data.size() < num_elements) {
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size()));
}
copy_from_host(data.data(), num_elements);
}
/// Copies data from the raw host pointer to fill the entire array
void copy_from_host(const T* data) {
copy_from_host(data, m_size);
}
/// Copies num_elements of data from the raw host pointer after enlarging the array so that everything fits in
void enlarge_and_copy_from_host(const T* data, const size_t num_elements) {
enlarge(num_elements);
copy_from_host(data, num_elements);
}
/// Copies num_elements from the host vector after enlarging the array so that everything fits in
void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
enlarge_and_copy_from_host(data.data(), num_elements);
}
/// Copies the entire host vector after enlarging the array so that everything fits in
void enlarge_and_copy_from_host(const std::vector<T>& data) {
enlarge_and_copy_from_host(data.data(), data.size());
}
/// Copies num_elements of data from the raw host pointer after resizing the array
void resize_and_copy_from_host(const T* data, const size_t num_elements) {
resize(num_elements);
copy_from_host(data, num_elements);
}
/// Copies num_elements from the host vector after resizing the array
void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
resize_and_copy_from_host(data.data(), num_elements);
}
/// Copies the entire host vector after resizing the array
void resize_and_copy_from_host(const std::vector<T>& data) {
resize_and_copy_from_host(data.data(), data.size());
}
/// Copies the entire host vector to the device. Fails if there is not enough space available.
void copy_from_host(const std::vector<T>& data) {
if (data.size() < m_size) {
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size()));
}
copy_from_host(data.data(), m_size);
}
/// Copies num_elements of data from the device to a raw pointer on the host. Fails if more elements are requested than are allocated.
void copy_to_host(T* host_data, const size_t num_elements) const {
if (num_elements > m_size) {
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(m_size));
}
try {
CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not copy to host: ") + error.what());
}
}
/// Copies num_elements from the device to a vector on the host
void copy_to_host(std::vector<T>& data, const size_t num_elements) const {
if (data.size() < num_elements) {
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size()));
}
copy_to_host(data.data(), num_elements);
}
/// Copies num_elements from the device to a raw pointer on the host
void copy_to_host(T* data) const {
copy_to_host(data, m_size);
}
/// Copies all elements from the device to a vector on the host
void copy_to_host(std::vector<T>& data) const {
if (data.size() < m_size) {
throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size()));
}
copy_to_host(data.data(), m_size);
}
/// Copies data from another device array to this one, automatically resizing it
void copy_from_device(const GPUMemory<T> &other) {
if (m_size != other.m_size) {
resize(other.m_size);
}
try {
CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, m_size * sizeof(T), cudaMemcpyDeviceToDevice));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not copy from device: ") + error.what());
}
}
/// Copies size elements from another device array to this one, automatically resizing it
void copy_from_device(const GPUMemory<T> &other, const size_t size) {
if (m_size < size) {
resize(size);
}
try {
CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice));
} catch (std::runtime_error error) {
throw std::runtime_error(std::string("Could not copy from device: ") + error.what());
}
}
// Creates an (owned) copy of the data
GPUMemory<T> copy() const {
GPUMemory<T> result{m_size};
result.copy_from_device(*this);
return result;
}
T* data() const {
check_guards();
return m_data;
}
__host__ __device__ T& operator[](size_t idx) const {
#ifdef DEBUG_BUFFER_OVERRUN
if (idx >= m_size) {
printf("WARNING: buffer overrun of %p at idx %zu\n", (void*)m_data, idx);
}
#endif
return m_data[idx];
}
__host__ __device__ T& operator[](uint32_t idx) const {
#ifdef DEBUG_BUFFER_OVERRUN
if (idx >= m_size) {
printf("WARNING: buffer overrun of %p at idx %u\n", (void*)m_data, idx);
}
#endif
return m_data[idx];
}
size_t get_num_elements() const {
return m_size;
}
size_t size() const {
return get_num_elements();
}
size_t get_bytes() const {
return m_size * sizeof(T);
}
size_t bytes() const {
return get_bytes();
}
};
}
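A brief sketch of the usage pattern described in the file header (allocate explicitly or implicitly, let the destructor release); illustrative only, with the exceptions thrown by `CUDA_CHECK_THROW` left unhandled:

```cpp
// Illustrative only, not part of the library.
#include <cubvh/gpu_memory.h>
#include <vector>

void round_trip() {
    std::vector<float> host(1024, 1.0f);

    cubvh::GPUMemory<float> device;           // empty, no allocation yet
    device.resize_and_copy_from_host(host);   // allocates 1024 floats and uploads
    device.memset(0, 512);                    // byte-wise zero of the first 512 elements

    std::vector<float> readback(device.size());
    device.copy_to_host(readback);            // downloads all elements
}   // destructor frees the device allocation
```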
#pragma once
#include <cubvh/common.h>
namespace cubvh {
// Triangle data structure
struct Triangle {
__host__ __device__ Eigen::Vector3f sample_uniform_position(const Eigen::Vector2f& sample) const {
float sqrt_x = std::sqrt(sample.x());
float factor0 = 1.0f - sqrt_x;
float factor1 = sqrt_x * (1.0f - sample.y());
float factor2 = sqrt_x * sample.y();
return factor0 * a + factor1 * b + factor2 * c;
}
__host__ __device__ float surface_area() const {
return 0.5f * Eigen::Vector3f((b - a).cross(c - a)).norm();
}
__host__ __device__ Eigen::Vector3f normal() const {
return (b - a).cross(c - a).normalized();
}
__host__ __device__ float ray_intersect(const Eigen::Vector3f &ro, const Eigen::Vector3f &rd, Eigen::Vector3f& n) const { // based on https://www.iquilezles.org/www/articles/intersectors/intersectors.htm
Eigen::Vector3f v1v0 = b - a;
Eigen::Vector3f v2v0 = c - a;
Eigen::Vector3f rov0 = ro - a;
n = v1v0.cross( v2v0 );
Eigen::Vector3f q = rov0.cross( rd );
float d = 1.0f/rd.dot( n );
float u = d*-q.dot( v2v0 );
float v = d* q.dot( v1v0 );
float t = d*-n.dot( rov0 );
if( u<0.0f || u>1.0f || v<0.0f || (u+v)>1.0f || t<0.0f) t = 1e6f;
return t; // Eigen::Vector3f( t, u, v );
}
__host__ __device__ float ray_intersect(const Eigen::Vector3f &ro, const Eigen::Vector3f &rd) const {
Eigen::Vector3f n;
return ray_intersect(ro, rd, n);
}
__host__ __device__ float distance_sq(const Eigen::Vector3f& pos) const {
// prepare data
Eigen::Vector3f v21 = b - a; Eigen::Vector3f p1 = pos - a;
Eigen::Vector3f v32 = c - b; Eigen::Vector3f p2 = pos - b;
Eigen::Vector3f v13 = a - c; Eigen::Vector3f p3 = pos - c;
Eigen::Vector3f nor = v21.cross(v13);
return
// inside/outside test
(sign(v21.cross(nor).dot(p1)) + sign(v32.cross(nor).dot(p2)) + sign(v13.cross(nor).dot(p3)) < 2.0f)
?
// 3 edges
std::min(
std::min(
(v21 * clamp(v21.dot(p1) / v21.squaredNorm(), 0.0f, 1.0f)-p1).squaredNorm(),
(v32 * clamp(v32.dot(p2) / v32.squaredNorm(), 0.0f, 1.0f)-p2).squaredNorm()
),
(v13 * clamp(v13.dot(p3) / v13.squaredNorm(), 0.0f, 1.0f)-p3).squaredNorm()
)
:
// 1 face
nor.dot(p1)*nor.dot(p1)/nor.squaredNorm();
}
__host__ __device__ float distance(const Eigen::Vector3f& pos) const {
return std::sqrt(distance_sq(pos));
}
__host__ __device__ bool point_in_triangle(const Eigen::Vector3f& p) const {
// Move the triangle so that the point becomes the
// triangle's origin
Eigen::Vector3f local_a = a - p;
Eigen::Vector3f local_b = b - p;
Eigen::Vector3f local_c = c - p;
// The point should be moved too, so they are both
// relative, but because we don't use p in the
// equation anymore, we don't need it!
// p -= p;
// Compute the normal vectors for triangles:
// u = normal of PBC
// v = normal of PCA
// w = normal of PAB
Eigen::Vector3f u = local_b.cross(local_c);
Eigen::Vector3f v = local_c.cross(local_a);
Eigen::Vector3f w = local_a.cross(local_b);
// Test to see if the normals are facing
// the same direction, return false if not
if (u.dot(v) < 0.0f) {
return false;
}
if (u.dot(w) < 0.0f) {
return false;
}
// All normals facing the same way, return true
return true;
}
__host__ __device__ Eigen::Vector3f closest_point_to_line(const Eigen::Vector3f& a, const Eigen::Vector3f& b, const Eigen::Vector3f& c) const {
float t = (c - a).dot(b - a) / (b - a).dot(b - a);
t = std::max(std::min(t, 1.0f), 0.0f);
return a + t * (b - a);
}
__host__ __device__ Eigen::Vector3f closest_point(Eigen::Vector3f point) const {
point -= normal().dot(point - a) * normal();
if (point_in_triangle(point)) {
return point;
}
Eigen::Vector3f c1 = closest_point_to_line(a, b, point);
Eigen::Vector3f c2 = closest_point_to_line(b, c, point);
Eigen::Vector3f c3 = closest_point_to_line(c, a, point);
float mag1 = (point - c1).squaredNorm();
float mag2 = (point - c2).squaredNorm();
float mag3 = (point - c3).squaredNorm();
float min = std::min(mag1, mag2);
min = std::min(min, mag3);
if (min == mag1) {
return c1;
}
else if (min == mag2) {
return c2;
}
return c3;
}
__host__ __device__ Eigen::Vector3f barycentric(const Eigen::Vector3f& p) const {
Eigen::Vector3f v0 = b - a;
Eigen::Vector3f v1 = c - a;
Eigen::Vector3f v2 = p - a;
float d00 = v0.dot(v0);
float d01 = v0.dot(v1);
float d11 = v1.dot(v1);
float d20 = v2.dot(v0);
float d21 = v2.dot(v1);
float denom = d00 * d11 - d01 * d01;
float v = (d11 * d20 - d01 * d21) / denom;
float w = (d00 * d21 - d01 * d20) / denom;
float u = 1.0f - v - w;
return Eigen::Vector3f(u, v, w);
}
__host__ __device__ Eigen::Vector3f centroid() const {
return (a + b + c) / 3.0f;
}
__host__ __device__ float centroid(int axis) const {
return (a[axis] + b[axis] + c[axis]) / 3;
}
__host__ __device__ void get_vertices(Eigen::Vector3f v[3]) const {
v[0] = a;
v[1] = b;
v[2] = c;
}
Eigen::Vector3f a, b, c;
int64_t id;
};
inline std::ostream& operator<<(std::ostream& os, const Triangle& triangle) {
os << "[";
os << "a=[" << triangle.a.x() << "," << triangle.a.y() << "," << triangle.a.z() << "], ";
os << "b=[" << triangle.b.x() << "," << triangle.b.y() << "," << triangle.b.z() << "], ";
os << "c=[" << triangle.c.x() << "," << triangle.c.y() << "," << triangle.c.z() << "]";
os << "]";
return os;
}
}
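A host-side sketch showing how the distance, closest-point, and barycentric helpers compose (illustrative only; the query point is arbitrary):

```cpp
// Illustrative only, not part of the library.
#include <cubvh/triangle.cuh>
#include <cstdio>

int main() {
    using namespace cubvh;
    Triangle tri{Eigen::Vector3f(0, 0, 0), Eigen::Vector3f(1, 0, 0), Eigen::Vector3f(0, 1, 0), 0};
    Eigen::Vector3f p(0.2f, 0.2f, 1.0f);

    float d             = tri.distance(p);       // Euclidean point-triangle distance
    Eigen::Vector3f cp  = tri.closest_point(p);  // nearest point on the triangle
    Eigen::Vector3f uvw = tri.barycentric(cp);   // weights such that cp == u*a + v*b + w*c

    printf("d=%f, cp=(%f, %f, %f), uvw=(%f, %f, %f)\n",
           d, cp.x(), cp.y(), cp.z(), uvw.x(), uvw.y(), uvw.z());
    return 0;
}
```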
# cuBVH
A CUDA Mesh BVH acceleration toolkit.
### Install
```bash
git clone https://github.com/ashawkey/cubvh
cd cubvh
pip install .
```
### Usage
Example of an interactive mesh renderer:
```bash
python test/renderer.py # default, show a dodecahedron
python test/renderer.py --mesh example.ply # show any mesh file
```
https://user-images.githubusercontent.com/25863658/183238748-7ac82808-6cd3-4bb6-867a-9c22f8e3f7dd.mp4
Example code:
```python
import numpy as np
import trimesh
import torch
import cubvh
# build BVH from mesh
mesh = trimesh.load('example.ply')
BVH = cubvh.cuBVH(mesh.vertices, mesh.faces) # build with numpy.ndarray/torch.Tensor
# query ray-mesh intersection
rays_o, rays_d = get_ray(pose, intrinsics, H, W) # [N, 3], [N, 3], query with torch.Tensor (cuda)
positions, face_id, depth = BVH.ray_trace(rays_o, rays_d) # [N, 3], [N], [N]
# query unsigned distance
points # [N, 3]
distances, face_id, uvw = BVH.unsigned_distance(points, return_uvw=True) # [N], [N], [N, 3]
```
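The returned `uvw` are the barycentric weights of the closest point on the returned face, so the closest surface points can be reconstructed from them (this is what `test/test.py` does to verify against trimesh):

```python
# reconstruct the closest surface points from face_id and the barycentric uvw
uvw = uvw.cpu().numpy()                   # [N, 3]
tris = mesh.faces[face_id.cpu().numpy()]  # [N, 3] vertex indices of each closest face
closest = mesh.vertices[tris[:, 0]] * uvw[:, [0]] + \
          mesh.vertices[tris[:, 1]] * uvw[:, [1]] + \
          mesh.vertices[tris[:, 2]] * uvw[:, [2]]  # [N, 3]
```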
### Acknowledgement
* Credits to [Thomas Müller](https://tom94.net/)'s amazing [tiny-cuda-nn](https://github.com/NVlabs/tiny-cuda-nn) and [instant-ngp](https://github.com/NVlabs/instant-ngp)!
import os
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
_src_path = os.path.dirname(os.path.abspath(__file__))
# ref: https://github.com/sxyu/sdf/blob/master/setup.py
def find_eigen(min_ver=(3, 3, 0)):
"""Helper to find or download the Eigen C++ library"""
import re, os
try_paths = [
'/usr/include/eigen3',
'/usr/local/include/eigen3',
os.path.expanduser('~/.local/include/eigen3'),
'C:/Program Files/eigen3',
'C:/Program Files (x86)/eigen3',
]
WORLD_VER_STR = "#define EIGEN_WORLD_VERSION"
MAJOR_VER_STR = "#define EIGEN_MAJOR_VERSION"
MINOR_VER_STR = "#define EIGEN_MINOR_VERSION"
EIGEN_WEB_URL = 'https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
TMP_EIGEN_FILE = 'tmp_eigen.tar.bz2'
TMP_EIGEN_DIR = 'eigen-3.3.7'
min_ver_str = '.'.join(map(str, min_ver))
eigen_path = None
for path in try_paths:
macros_path = os.path.join(path, 'Eigen/src/Core/util/Macros.h')
if os.path.exists(macros_path):
macros = open(macros_path, 'r').read().split('\n')
world_ver, major_ver, minor_ver = None, None, None
for line in macros:
if line.startswith(WORLD_VER_STR):
world_ver = int(line[len(WORLD_VER_STR):])
elif line.startswith(MAJOR_VER_STR):
major_ver = int(line[len(MAJOR_VER_STR):])
elif line.startswith(MINOR_VER_STR):
minor_ver = int(line[len(MINOR_VER_STR):])
if world_ver is None or major_ver is None or minor_ver is None:
print('Failed to parse macros file', macros_path)
else:
ver = (world_ver, major_ver, minor_ver)
ver_str = '.'.join(map(str, ver))
if ver < min_ver:
print('Found unsuitable Eigen version', ver_str, 'at',
path, '(need >= ' + min_ver_str + ')')
else:
print('Found Eigen version', ver_str, 'at', path)
eigen_path = path
break
if eigen_path is None:
try:
import urllib.request
print("Couldn't find Eigen locally, downloading...")
req = urllib.request.Request(
EIGEN_WEB_URL,
data=None,
headers={
'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
})
with urllib.request.urlopen(req) as resp,\
open(TMP_EIGEN_FILE, 'wb') as file:
data = resp.read()
file.write(data)
import tarfile
tar = tarfile.open(TMP_EIGEN_FILE)
tar.extractall()
tar.close()
eigen_path = TMP_EIGEN_DIR
os.remove(TMP_EIGEN_FILE)
except:
print('Download failed, failed to find Eigen')
if eigen_path is not None:
print('Found eigen at', eigen_path)
return eigen_path
nvcc_flags = [
'-O3', '-std=c++14',
"--expt-extended-lambda",
"--expt-relaxed-constexpr",
'-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
]
if os.name == "posix":
c_flags = ['-O3', '-std=c++14']
elif os.name == "nt":
c_flags = ['/O2', '/std:c++17']
# find cl.exe
def find_cl_path():
import glob
for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
if paths:
return paths[0]
# If cl.exe is not on path, try to find it.
if os.system("where cl.exe >nul 2>nul") != 0:
cl_path = find_cl_path()
if cl_path is None:
raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
os.environ["PATH"] += ";" + cl_path
'''
Usage:
python setup.py build_ext --inplace # build extensions locally, do not install (can only be used from the parent directory)
python setup.py install # build extensions and install (copy) to PATH.
pip install . # ditto but better (e.g., dependency & metadata handling)
python setup.py develop # build extensions and install (symbolic) to PATH.
pip install -e . # ditto but better (e.g., dependency & metadata handling)
'''
setup(
name='cubvh', # package name, import this to use python API
version='0.1.0',
description='CUDA BVH implementation',
url='https://github.com/ashawkey/cubvh',
author='kiui',
author_email='ashawkey1999@gmail.com',
ext_modules=[
CUDAExtension(
name='_cubvh', # extension name, import this to use CUDA API
sources=[os.path.join(_src_path, 'src', f) for f in [
'bvh.cu',
'api.cu',
'bindings.cpp',
]],
include_dirs=[
os.path.join(_src_path, 'include'),
find_eigen(),
],
extra_compile_args={
'cxx': c_flags,
'nvcc': nvcc_flags,
}
),
],
cmdclass={
'build_ext': BuildExtension,
},
install_requires=[
'ninja',
'trimesh',
'opencv-python',
'torch',
'numpy',
'tqdm',
'matplotlib',
'dearpygui',
],
)
#pragma once
#include <cubvh/api.h>
#include <cubvh/common.h>
#include <cubvh/bvh.cuh>
#include <Eigen/Dense>
using namespace Eigen;
using Verts = Matrix<float, Dynamic, 3, RowMajor>;
using Trigs = Matrix<uint32_t, Dynamic, 3, RowMajor>;
namespace cubvh {
class cuBVHImpl : public cuBVH {
public:
// accept numpy array (cpu) to init
cuBVHImpl(Ref<const Verts> vertices, Ref<const Trigs> triangles) : cuBVH() {
const size_t n_vertices = vertices.rows();
const size_t n_triangles = triangles.rows();
triangles_cpu.resize(n_triangles);
for (size_t i = 0; i < n_triangles; i++) {
triangles_cpu[i] = {vertices.row(triangles(i, 0)), vertices.row(triangles(i, 1)), vertices.row(triangles(i, 2)), (int64_t)i};
}
if (!triangle_bvh) {
triangle_bvh = TriangleBvh::make();
}
triangle_bvh->build(triangles_cpu, 8);
triangles_gpu.resize_and_copy_from_host(triangles_cpu);
// TODO: need OPTIX
// triangle_bvh->build_optix(triangles_gpu, m_inference_stream);
}
// query with torch tensors (gpu)
void ray_trace(at::Tensor rays_o, at::Tensor rays_d, at::Tensor positions, at::Tensor face_id, at::Tensor depth) {
const uint32_t n_elements = rays_o.size(0);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
triangle_bvh->ray_trace_gpu(n_elements, rays_o.data_ptr<float>(), rays_d.data_ptr<float>(), positions.data_ptr<float>(), face_id.data_ptr<int64_t>(), depth.data_ptr<float>(), triangles_gpu.data(), stream);
}
// optionally return uvw (barycentric) and face_id
void unsigned_distance(at::Tensor positions, at::Tensor distances, at::Tensor face_id, at::optional<at::Tensor> uvw) {
const uint32_t n_elements = positions.size(0);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
triangle_bvh->unsigned_distance_gpu(n_elements, positions.data_ptr<float>(), distances.data_ptr<float>(), face_id.data_ptr<int64_t>(), uvw.has_value() ? uvw.value().data_ptr<float>() : nullptr, triangles_gpu.data(), stream);
}
std::vector<Triangle> triangles_cpu;
GPUMemory<Triangle> triangles_gpu;
std::shared_ptr<TriangleBvh> triangle_bvh;
};
cuBVH* create_cuBVH(Ref<const Verts> vertices, Ref<const Trigs> triangles) {
return new cuBVHImpl{vertices, triangles};
}
} // namespace cubvh
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/eigen.h>
#include <torch/extension.h>
#include <cubvh/api.h>
namespace py = pybind11;
using namespace cubvh;
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::class_<cuBVH>(m, "cuBVH")
.def("ray_trace", &cuBVH::ray_trace)
.def("unsigned_distance", &cuBVH::unsigned_distance);
m.def("create_cuBVH", &create_cuBVH);
}
import argparse
import numpy as np
import trimesh
import torch
import cubvh
import dearpygui.dearpygui as dpg
from scipy.spatial.transform import Rotation as R
def create_dodecahedron(radius=1, center=np.array([0, 0, 0])):
vertices = np.array([
-0.57735, -0.57735, 0.57735,
0.934172, 0.356822, 0,
0.934172, -0.356822, 0,
-0.934172, 0.356822, 0,
-0.934172, -0.356822, 0,
0, 0.934172, 0.356822,
0, 0.934172, -0.356822,
0.356822, 0, -0.934172,
-0.356822, 0, -0.934172,
0, -0.934172, -0.356822,
0, -0.934172, 0.356822,
0.356822, 0, 0.934172,
-0.356822, 0, 0.934172,
0.57735, 0.57735, -0.57735,
0.57735, 0.57735, 0.57735,
-0.57735, 0.57735, -0.57735,
-0.57735, 0.57735, 0.57735,
0.57735, -0.57735, -0.57735,
0.57735, -0.57735, 0.57735,
-0.57735, -0.57735, -0.57735,
]).reshape((-1,3), order="C")
faces = np.array([
19, 3, 2,
12, 19, 2,
15, 12, 2,
8, 14, 2,
18, 8, 2,
3, 18, 2,
20, 5, 4,
9, 20, 4,
16, 9, 4,
13, 17, 4,
1, 13, 4,
5, 1, 4,
7, 16, 4,
6, 7, 4,
17, 6, 4,
6, 15, 2,
7, 6, 2,
14, 7, 2,
10, 18, 3,
11, 10, 3,
19, 11, 3,
11, 1, 5,
10, 11, 5,
20, 10, 5,
20, 9, 8,
10, 20, 8,
18, 10, 8,
9, 16, 7,
8, 9, 7,
14, 8, 7,
12, 15, 6,
13, 12, 6,
17, 13, 6,
13, 1, 11,
12, 13, 11,
19, 12, 11,
]).reshape((-1, 3), order="C")-1
length = np.linalg.norm(vertices, axis=1).reshape((-1, 1))
vertices = vertices / length * radius + center
return trimesh.Trimesh(vertices=vertices, faces=faces)
class OrbitCamera:
def __init__(self, W, H, r=2, fovy=60):
self.W = W
self.H = H
self.radius = r # camera distance from center
self.fovy = fovy # in degree
self.center = np.array([0, 0, 0], dtype=np.float32) # look at this point
self.rot = R.from_quat([1, 0, 0, 0]) # init camera matrix: [[1, 0, 0], [0, -1, 0], [0, 0, -1]] (to suit ngp convention)
self.up = np.array([0, 1, 0], dtype=np.float32) # need to be normalized!
# pose
@property
def pose(self):
# first move camera to radius
res = np.eye(4, dtype=np.float32)
res[2, 3] -= self.radius
# rotate
rot = np.eye(4, dtype=np.float32)
rot[:3, :3] = self.rot.as_matrix()
res = rot @ res
# translate
res[:3, 3] -= self.center
return res
# intrinsics
@property
def intrinsics(self):
focal = self.H / (2 * np.tan(np.radians(self.fovy) / 2))
return np.array([focal, focal, self.W // 2, self.H // 2])
def orbit(self, dx, dy):
# rotate along camera up/side axis!
side = self.rot.as_matrix()[:3, 0] # camera side (x) axis in world space, already normalized
rotvec_x = self.up * np.radians(-0.05 * dx)
rotvec_y = side * np.radians(-0.05 * dy)
self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot
def scale(self, delta):
self.radius *= 1.1 ** (-delta)
def pan(self, dx, dy, dz=0):
# pan in camera coordinate system (careful on the sensitivity!)
self.center += 0.0005 * self.rot.as_matrix()[:3, :3] @ np.array([dx, dy, dz])
@torch.cuda.amp.autocast(enabled=False)
def get_rays(poses, intrinsics, H, W, N=-1, error_map=None):
''' get rays
Args:
poses: [B, 4, 4], cam2world
intrinsics: [4]
H, W, N: int
error_map: [B, 128 * 128], sample probability based on training error
Returns:
rays_o, rays_d: [B, N, 3]
inds: [B, N]
'''
device = poses.device
B = poses.shape[0]
fx, fy, cx, cy = intrinsics
i, j = torch.meshgrid(torch.linspace(0, W-1, W, device=device), torch.linspace(0, H-1, H, device=device))
i = i.t().reshape([1, H*W]).expand([B, H*W]) + 0.5
j = j.t().reshape([1, H*W]).expand([B, H*W]) + 0.5
results = {}
if N > 0:
N = min(N, H*W)
if error_map is None:
inds = torch.randint(0, H*W, size=[N], device=device) # may duplicate
inds = inds.expand([B, N])
else:
# weighted sample on a low-reso grid
inds_coarse = torch.multinomial(error_map.to(device), N, replacement=False) # [B, N], but in [0, 128*128)
# map to the original resolution with random perturb.
inds_x, inds_y = inds_coarse // 128, inds_coarse % 128 # `//` will throw a warning in torch 1.10... anyway.
sx, sy = H / 128, W / 128
inds_x = (inds_x * sx + torch.rand(B, N, device=device) * sx).long().clamp(max=H - 1)
inds_y = (inds_y * sy + torch.rand(B, N, device=device) * sy).long().clamp(max=W - 1)
inds = inds_x * W + inds_y
results['inds_coarse'] = inds_coarse # need this when updating error_map
i = torch.gather(i, -1, inds)
j = torch.gather(j, -1, inds)
results['inds'] = inds
else:
inds = torch.arange(H*W, device=device).expand([B, H*W])
zs = torch.ones_like(i)
xs = (i - cx) / fx * zs
ys = (j - cy) / fy * zs
directions = torch.stack((xs, ys, zs), dim=-1)
directions = directions / torch.norm(directions, dim=-1, keepdim=True)
rays_d = directions @ poses[:, :3, :3].transpose(-1, -2) # (B, N, 3)
rays_o = poses[..., :3, 3] # [B, 3]
rays_o = rays_o[..., None, :].expand_as(rays_d) # [B, N, 3]
results['rays_o'] = rays_o
results['rays_d'] = rays_d
return results
class GUI:
def __init__(self, opt, debug=True):
self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
self.W = opt.W
self.H = opt.H
self.debug = debug
self.cam = OrbitCamera(opt.W, opt.H, r=opt.radius, fovy=opt.fovy)
self.bg_color = torch.ones(3, dtype=torch.float32) # default white bg
self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
self.need_update = True # camera moved, should reset accumulation
self.mode = 'face_id' # choose from ['position', 'depth', 'face_id']?
# Set3
self.cmap = np.array([
(0.5529411764705883, 0.8274509803921568, 0.7803921568627451),
(1.0, 1.0, 0.7019607843137254),
(0.7450980392156863, 0.7294117647058823, 0.8549019607843137),
(0.984313725490196, 0.5019607843137255, 0.4470588235294118),
(0.5019607843137255, 0.6941176470588235, 0.8274509803921568),
(0.9921568627450981, 0.7058823529411765, 0.3843137254901961),
(0.7019607843137254, 0.8705882352941177, 0.4117647058823529),
(0.9882352941176471, 0.803921568627451, 0.8980392156862745),
(0.8509803921568627, 0.8509803921568627, 0.8509803921568627),
(0.7372549019607844, 0.5019607843137255, 0.7411764705882353),
(0.8, 0.9215686274509803, 0.7725490196078432),
(1.0, 0.9294117647058824, 0.43529411764705883)], dtype=np.float32)
# load mesh
if opt.mesh == '':
self.mesh = create_dodecahedron()
else:
self.mesh = trimesh.load(opt.mesh, force='mesh', skip_material=True)
# normalize
center = self.mesh.vertices.mean(axis=0)
length = (self.mesh.vertices.max(axis=0) - self.mesh.vertices.min(axis=0)).max()
self.mesh.vertices = (self.mesh.vertices - center) / (length + 1e-5)
print(f'[INFO] load mesh {self.mesh.vertices.shape}, {self.mesh.faces.shape}')
# prepare raytracer
self.RT = cubvh.cuBVH(self.mesh.vertices, self.mesh.faces)
dpg.create_context()
self.register_dpg()
self.step()
def __del__(self):
dpg.destroy_context()
def prepare_buffer(self, outputs):
positions, face_id, depth = outputs
if self.mode == 'position':
# positions are actual 3D points; visualize them with a naive per-image normalization
positions = positions.detach().cpu().numpy().reshape(self.H, self.W, 3)
positions = (positions - positions.min(axis=0, keepdims=True)) / (positions.max(axis=0, keepdims=True) - positions.min(axis=0, keepdims=True) + 1e-8)
return positions
elif self.mode == 'face_id':
# map face_id to colors with a categorical colormap (background where face_id < 0)
face_id = face_id.detach().cpu().numpy().reshape(self.H, self.W)
mask = face_id < 0 # the bg
face_id = self.cmap[face_id % self.cmap.shape[0]]
face_id[mask] = 0
return face_id
elif self.mode == 'depth':
depth = depth.detach().cpu().numpy().reshape(self.H, self.W, 1)
mask = depth >= 10
mn = depth[~mask].min()
mx = depth[~mask].max()
depth = (depth - mn) / (mx - mn + 1e-5)
depth[mask] = 0
depth = depth.repeat(3, -1)
return depth
else:
raise NotImplementedError()
def step(self):
if self.need_update:
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
starter.record()
# outputs = self.trainer.test_gui(self.cam.pose, self.cam.intrinsics, self.W, self.H, self.bg_color, self.spp, self.downscale)
pose = torch.from_numpy(self.cam.pose).unsqueeze(0).cuda()
rays = get_rays(pose, self.cam.intrinsics, self.H, self.W, -1)
rays_o = rays['rays_o'].contiguous().view(-1, 3)
rays_d = rays['rays_d'].contiguous().view(-1, 3)
outputs = self.RT.ray_trace(rays_o, rays_d)
ender.record()
torch.cuda.synchronize()
t = starter.elapsed_time(ender)
if self.need_update:
self.render_buffer = self.prepare_buffer(outputs)
self.need_update = False
else:
self.render_buffer = (self.render_buffer * self.spp + self.prepare_buffer(outputs)) / (self.spp + 1)
dpg.set_value("_log_infer_time", f'{t:.4f}ms ({int(1000/t)} FPS)')
dpg.set_value("_texture", self.render_buffer)
def register_dpg(self):
### register texture
with dpg.texture_registry(show=False):
dpg.add_raw_texture(self.W, self.H, self.render_buffer, format=dpg.mvFormat_Float_rgb, tag="_texture")
### register window
# the rendered image, as the primary window
with dpg.window(tag="_primary_window", width=self.W, height=self.H):
# add the texture
dpg.add_image("_texture")
dpg.set_primary_window("_primary_window", True)
# control window
with dpg.window(label="Control", tag="_control_window", width=300, height=200):
# button theme
with dpg.theme() as theme_button:
with dpg.theme_component(dpg.mvButton):
dpg.add_theme_color(dpg.mvThemeCol_Button, (23, 3, 18))
dpg.add_theme_color(dpg.mvThemeCol_ButtonHovered, (51, 3, 47))
dpg.add_theme_color(dpg.mvThemeCol_ButtonActive, (83, 18, 83))
dpg.add_theme_style(dpg.mvStyleVar_FrameRounding, 5)
dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 3, 3)
with dpg.group(horizontal=True):
dpg.add_text("Infer time: ")
dpg.add_text("no data", tag="_log_infer_time")
# rendering options
with dpg.collapsing_header(label="Options", default_open=True):
# mode combo
def callback_change_mode(sender, app_data):
self.mode = app_data
self.need_update = True
dpg.add_combo(('position', 'face_id', 'depth'), label='mode', default_value=self.mode, callback=callback_change_mode)
# # bg_color picker
# def callback_change_bg(sender, app_data):
# self.bg_color = torch.tensor(app_data[:3], dtype=torch.float32) # only need RGB in [0, 1]
# self.need_update = True
# dpg.add_color_edit((255, 255, 255), label="Background Color", width=200, tag="_color_editor", no_alpha=True, callback=callback_change_bg)
# fov slider
def callback_set_fovy(sender, app_data):
self.cam.fovy = app_data
self.need_update = True
dpg.add_slider_int(label="FoV (vertical)", min_value=1, max_value=120, format="%d deg", default_value=self.cam.fovy, callback=callback_set_fovy)
# debug info
if self.debug:
with dpg.collapsing_header(label="Debug"):
# pose
dpg.add_separator()
dpg.add_text("Camera Pose:")
dpg.add_text(str(self.cam.pose), tag="_log_pose")
### register camera handler
def callback_camera_drag_rotate(sender, app_data):
if not dpg.is_item_focused("_primary_window"):
return
dx = app_data[1]
dy = app_data[2]
self.cam.orbit(dx, dy)
self.need_update = True
if self.debug:
dpg.set_value("_log_pose", str(self.cam.pose))
def callback_camera_wheel_scale(sender, app_data):
if not dpg.is_item_focused("_primary_window"):
return
delta = app_data
self.cam.scale(delta)
self.need_update = True
if self.debug:
dpg.set_value("_log_pose", str(self.cam.pose))
def callback_camera_drag_pan(sender, app_data):
if not dpg.is_item_focused("_primary_window"):
return
dx = app_data[1]
dy = app_data[2]
self.cam.pan(dx, dy)
self.need_update = True
if self.debug:
dpg.set_value("_log_pose", str(self.cam.pose))
with dpg.handler_registry():
dpg.add_mouse_drag_handler(button=dpg.mvMouseButton_Left, callback=callback_camera_drag_rotate)
dpg.add_mouse_wheel_handler(callback=callback_camera_wheel_scale)
dpg.add_mouse_drag_handler(button=dpg.mvMouseButton_Middle, callback=callback_camera_drag_pan)
dpg.create_viewport(title='mesh viewer', width=self.W, height=self.H, resizable=False)
### global theme
with dpg.theme() as theme_no_padding:
with dpg.theme_component(dpg.mvAll):
# set all padding to 0 to avoid scroll bar
dpg.add_theme_style(dpg.mvStyleVar_WindowPadding, 0, 0, category=dpg.mvThemeCat_Core)
dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 0, 0, category=dpg.mvThemeCat_Core)
dpg.add_theme_style(dpg.mvStyleVar_CellPadding, 0, 0, category=dpg.mvThemeCat_Core)
dpg.bind_item_theme("_primary_window", theme_no_padding)
dpg.setup_dearpygui()
#dpg.show_metrics()
dpg.show_viewport()
def render(self):
while dpg.is_dearpygui_running():
self.step()
dpg.render_dearpygui_frame()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--mesh', default='', type=str)
parser.add_argument('--W', type=int, default=1920, help="GUI width")
parser.add_argument('--H', type=int, default=1080, help="GUI height")
parser.add_argument('--radius', type=float, default=5, help="default GUI camera radius from center")
parser.add_argument('--fovy', type=float, default=50, help="default GUI camera fovy")
opt = parser.parse_args()
gui = GUI(opt)
gui.render()
import numpy as np
import trimesh
import argparse
import torch
import cubvh
import time
def create_dodecahedron(radius=1, center=np.array([0, 0, 0])):
vertices = np.array([
-0.57735, -0.57735, 0.57735,
0.934172, 0.356822, 0,
0.934172, -0.356822, 0,
-0.934172, 0.356822, 0,
-0.934172, -0.356822, 0,
0, 0.934172, 0.356822,
0, 0.934172, -0.356822,
0.356822, 0, -0.934172,
-0.356822, 0, -0.934172,
0, -0.934172, -0.356822,
0, -0.934172, 0.356822,
0.356822, 0, 0.934172,
-0.356822, 0, 0.934172,
0.57735, 0.57735, -0.57735,
0.57735, 0.57735, 0.57735,
-0.57735, 0.57735, -0.57735,
-0.57735, 0.57735, 0.57735,
0.57735, -0.57735, -0.57735,
0.57735, -0.57735, 0.57735,
-0.57735, -0.57735, -0.57735,
]).reshape((-1,3), order="C")
faces = np.array([
19, 3, 2,
12, 19, 2,
15, 12, 2,
8, 14, 2,
18, 8, 2,
3, 18, 2,
20, 5, 4,
9, 20, 4,
16, 9, 4,
13, 17, 4,
1, 13, 4,
5, 1, 4,
7, 16, 4,
6, 7, 4,
17, 6, 4,
6, 15, 2,
7, 6, 2,
14, 7, 2,
10, 18, 3,
11, 10, 3,
19, 11, 3,
11, 1, 5,
10, 11, 5,
20, 10, 5,
20, 9, 8,
10, 20, 8,
18, 10, 8,
9, 16, 7,
8, 9, 7,
14, 8, 7,
12, 15, 6,
13, 12, 6,
17, 13, 6,
13, 1, 11,
12, 13, 11,
19, 12, 11,
]).reshape((-1, 3), order="C")-1
length = np.linalg.norm(vertices, axis=1).reshape((-1, 1))
vertices = vertices / length * radius + center
return trimesh.Trimesh(vertices=vertices, faces=faces)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--N', default=1000, type=int)
parser.add_argument('--mesh', default='', type=str)
opt = parser.parse_args()
if opt.mesh == '':
mesh = create_dodecahedron()
else:
mesh = trimesh.load(opt.mesh, force='mesh', skip_material=True)
# query nearest triangles for a set of points
points = torch.randn(opt.N, 3, device='cuda', dtype=torch.float32)
# Ours
_t0 = time.time()
BVH = cubvh.cuBVH(mesh.vertices, mesh.faces)
torch.cuda.synchronize()
_t1 = time.time()
distances, face_id, uvw = BVH.unsigned_distance(points, return_uvw=True)
torch.cuda.synchronize()
_t2 = time.time()
print(f'[TIME] Ours total {_t2 - _t0:.6f}s = build {_t1 - _t0:.6f}s + query {_t2 - _t1:.6f}s')
# GT results by trimesh
_t0 = time.time()
gt_cpoint, gt_distances, gt_face_id = trimesh.proximity.closest_point(mesh, points.cpu().numpy())
_t1 = time.time()
print(f'[TIME] Trimesh total {_t1 - _t0:.6f}s')
# verify correctness
face_id = face_id.cpu().numpy().astype(np.int64)
# NOTE: if there are duplicated vertices, this will fail... but it won't affect later correctness.
# np.testing.assert_array_equal(
# face_id,
# gt_face_id.astype(np.int64)
# )
distances = distances.cpu().numpy().astype(np.float32)
np.testing.assert_allclose(
distances,
gt_distances.astype(np.float32),
atol=1e-5
)
# calc cpoint from uvw and face_id
uvw = uvw.cpu().numpy().astype(np.float32) # [M, 3]
trigs = mesh.faces[face_id] # [M, 3]
cpoint = mesh.vertices[trigs[:, 0]] * uvw[:, [0]] + \
mesh.vertices[trigs[:, 1]] * uvw[:, [1]] + \
mesh.vertices[trigs[:, 2]] * uvw[:, [2]]
np.testing.assert_allclose(
cpoint,
gt_cpoint.astype(np.float32),
atol=1e-5
)