Fix Windows build problem

77f1cf0b · yan.yan · 19a599e1
Commit 77f1cf0b authored Sep 24, 2022 by yan.yan
5 changed files
--- a/spconv/algo.py
+++ b/spconv/algo.py
@@ -164,6 +164,7 @@ class GemmTunerSimple(GemmTunerSimpleBase):
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
        mod, ker = self._compile_nvrtc_module(desp)
+        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
        nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel")
        self._nvrtc_caches[key] = nvrtc_params
        return nvrtc_params
@@ -288,6 +289,7 @@ class SimpleGemm:
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
        mod, ker = self._compile_nvrtc_module(desp)
+        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
        nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel")
        self._nvrtc_caches[key] = nvrtc_params
        return nvrtc_params

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -126,6 +126,7 @@ class SpconvOps(pccm.Class):
                defines.append(f"#define SPCONV_ALLOC_{to_snake_case(name).upper()} {pccm.literal(v)}")
        define_str = "\n".join(defines)
        self.add_global_code(define_str)
+        self.build_meta.add_global_cflags("cl", "/DNOMINMAX")
        # for name in dir(AllocKeys):
        #     if not name.startswith("__"):
        #         v = getattr(AllocKeys, name)
@@ -1580,10 +1581,10 @@ class SpconvOps(pccm.Class):
        }}
        if (!subm){{
            size_t pair_single_size = kv * int64_t(num_act_in);
-            auto ten = tv::from_blob(workspace, {{pair_single_size + 1}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
+            auto ten = tv::from_blob(workspace, {{int64_t(pair_single_size + 1)}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
            res.insert({{{pccm.literal(AllocKeys.IndicePairsUniq)}, ten}});
            workspace += ten.nbytes();
-            auto ten2 = tv::from_blob(workspace, {{pair_single_size + 1}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
+            auto ten2 = tv::from_blob(workspace, {{int64_t(pair_single_size + 1)}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
            res.insert({{{pccm.literal(AllocKeys.IndicePairsUniqBackup)}, ten2}});
            workspace += ten2.nbytes();
        }}

--- a/spconv/csrc/sparse/alloc.py
+++ b/spconv/csrc/sparse/alloc.py
@@ -220,7 +220,7 @@ class ThrustAllocator(pccm.Class):
        code.arg("ptr", "char *")
        code.arg("num_bytes", "size_t")
        code.raw(f"""
-        return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
+        return allocator_.free_noexcept(tv::from_blob(ptr, {{int64_t(num_bytes)}}, tv::uint8, 0));
        """)
        return code

--- a/spconv/csrc/utils/pcc.py
+++ b/spconv/csrc/utils/pcc.py
@@ -64,7 +64,6 @@ class PointCloudCompress(pccm.Class):
        auto point_stride = points.stride(0);
        int64_t final_size = sizeof(int64_t) * 5 + sizeof(float) * 3;
        tv::Tensor res;
-        tv::ssprint(1);
        tv::dispatch<float, double>(points.dtype(), [&](auto IP){{
            using TPoint = TV_DECLTYPE(IP);
@@ -88,13 +87,13 @@ class PointCloudCompress(pccm.Class):
                        auto pos_int = op::apply(floorf, pos_unit_voxel).cast<int32_t>();
                        auto pos_enc = (point / errors - pos_int.cast<float>() * float(256)).cast<uint8_t>();
                        tv::array<uint8_t, kEncodeDim> enc;
-                        tv::if_constexpr<(kEncodeDim > 3)>([&](auto _){{
+                        enc[0] = pos_enc[0];
+                        enc[1] = pos_enc[1];
+                        enc[2] = pos_enc[2];
+                        if (kEncodeDim > 3){{
                            TInten inten = intensity_data[0];
-                            enc = _(tv::array<uint8_t, kEncodeDim>{{pos_enc[0], pos_enc[1], pos_enc[2], uint8_t(inten)}});
+                            enc[3] = uint8_t(inten);
-                            intensity_data += inten_stride;
+                        }}
-                        }}, [&](auto _){{
-                            enc = _(tv::array<uint8_t, kEncodeDim>{{pos_enc[0], pos_enc[1], pos_enc[2]}});
-                        }});
                        auto pos_uint = pos_int + hash_t::direct_hash_offset();
                        uint64_t scalar = hash_t::encode(pos_int[0], pos_int[1], pos_int[2]);
                        auto iter = hash.find(scalar);
@@ -225,7 +224,7 @@ class PointCloudCompress(pccm.Class):
        error[2] = error_header[2];
        res_ptr += sizeof(float) * 3;
        tv::Tensor points;
-        tv::dispatch_int<static_cast<int>(EncodeType::XYZI_8), static_cast<int>(EncodeType::XYZ_8)>(static_cast<int>(type), [&](auto I){{
+        tv::dispatch_int<static_cast<int>(EncodeType::XYZI_8), static_cast<int>(EncodeType::XYZ_8)>(static_cast<int>(type), [&, error](auto I){{
            constexpr int kTypeInt = TV_DECLTYPE(I)::value;
            constexpr int kEncodeDim = kTypeInt == static_cast<int>(EncodeType::XYZI_8) ? 4 : 3;
            points = tv::empty({{N, kEncodeDim}}, tv::float32);
@@ -241,7 +240,7 @@ class PointCloudCompress(pccm.Class):
                auto point_cur_ptr = points_ptr;
                for (int j = 0; j < cluster_size; ++j){{
                    auto& enc = enc_ptr[j];
-                    auto point = op::slice<0, 3>(enc).template cast<float>() * error + offset;
+                    tv::array<float, 3> point = op::slice<0, 3>(enc).template cast<float>() * error + offset;
                    point_cur_ptr[0] = point[0];
                    point_cur_ptr[1] = point[1];
                    point_cur_ptr[2] = point[2];

--- a/test/test_all_algo.py
+++ b/test/test_all_algo.py
@@ -640,6 +640,7 @@ def _test_native_conv_cuda(subm: bool):
    arch = torch.cuda.get_device_capability()
    stream = get_current_stream()
+    force_nvrtc = False
    for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
            shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, dtypes)):
@@ -718,7 +719,8 @@ def _test_native_conv_cuda(subm: bool):
                                c_inds=out_indices,
                                hint=AlgoHint.Fowrard.value,
                                alpha=1.0,
-                                beta=beta)
+                                beta=beta,
+                                force_nvrtc=force_nvrtc)
                        else:
                            GEMM.run_with_tuned_result(
                                BestAlgoByProfile(desp, tester.arch, 1),
@@ -735,7 +737,8 @@ def _test_native_conv_cuda(subm: bool):
                                c_inds=out_indices,
                                hint=AlgoHint.Fowrard.value,
                                alpha=1.0,
-                                beta=beta)
+                                beta=beta,
+                                force_nvrtc=force_nvrtc)
                        inited = True
                    if bias is not None and tester.check_act:
                        InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0)
@@ -801,7 +804,8 @@ def _test_native_conv_cuda(subm: bool):
                                c_inds=inp_indices,
                                hint=AlgoHint.Fowrard.value,
                                alpha=1.0,
-                                beta=beta)
+                                beta=beta,
+                                force_nvrtc=force_nvrtc)
                        else:
                            GEMM.run_with_tuned_result(
                                BestAlgoByProfile(desp, tester.arch, 1),
@@ -818,7 +822,8 @@ def _test_native_conv_cuda(subm: bool):
                                c_inds=inp_indices,
                                hint=AlgoHint.Fowrard.value,
                                alpha=1.0,
-                                beta=beta)
+                                beta=beta,
+                                force_nvrtc=force_nvrtc)
                        inited = True
                    din_my = inp_tv.cpu().numpy()
@@ -879,7 +884,8 @@ def _test_native_conv_cuda(subm: bool):
                                c_inds=tv.Tensor(),
                                hint=AlgoHint.BackwardWeight.value,
                                alpha=1.0,
-                                beta=beta)
+                                beta=beta,
+                                force_nvrtc=force_nvrtc)
                        else:
                            GEMM.run_with_tuned_result(BestAlgoByProfile(desp, tester.arch, 32),
@@ -896,7 +902,8 @@ def _test_native_conv_cuda(subm: bool):
                                                    b_inds=b_inds,
                                                    hint=AlgoHint.BackwardWeight.value,
                                                    alpha=1.0,
-                                                    beta=beta)
+                                                    beta=beta,
+                                                    force_nvrtc=force_nvrtc)
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
@@ -909,8 +916,8 @@ def _test_native_conv_cuda(subm: bool):
 def test_all_algo_unit():
    # for i in range(5):
-    _test_impgemm_conv_cuda(True)
+    # _test_impgemm_conv_cuda(True)
-    _test_impgemm_conv_cuda(False)
+    # _test_impgemm_conv_cuda(False)
    _test_native_conv_cuda(True)
    _test_native_conv_cuda(False)