Commit 77f1cf0b authored by yan.yan
Browse files

fix windows build problem

parent 19a599e1
...@@ -164,6 +164,7 @@ class GemmTunerSimple(GemmTunerSimpleBase): ...@@ -164,6 +164,7 @@ class GemmTunerSimple(GemmTunerSimpleBase):
if key in self._nvrtc_caches: if key in self._nvrtc_caches:
return self._nvrtc_caches[key] return self._nvrtc_caches[key]
mod, ker = self._compile_nvrtc_module(desp) mod, ker = self._compile_nvrtc_module(desp)
print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel") nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel")
self._nvrtc_caches[key] = nvrtc_params self._nvrtc_caches[key] = nvrtc_params
return nvrtc_params return nvrtc_params
...@@ -288,6 +289,7 @@ class SimpleGemm: ...@@ -288,6 +289,7 @@ class SimpleGemm:
if key in self._nvrtc_caches: if key in self._nvrtc_caches:
return self._nvrtc_caches[key] return self._nvrtc_caches[key]
mod, ker = self._compile_nvrtc_module(desp) mod, ker = self._compile_nvrtc_module(desp)
print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel") nvrtc_params = _get_nvrtc_params(mod, ker, "gemm_kernel")
self._nvrtc_caches[key] = nvrtc_params self._nvrtc_caches[key] = nvrtc_params
return nvrtc_params return nvrtc_params
......
...@@ -126,6 +126,7 @@ class SpconvOps(pccm.Class): ...@@ -126,6 +126,7 @@ class SpconvOps(pccm.Class):
defines.append(f"#define SPCONV_ALLOC_{to_snake_case(name).upper()} {pccm.literal(v)}") defines.append(f"#define SPCONV_ALLOC_{to_snake_case(name).upper()} {pccm.literal(v)}")
define_str = "\n".join(defines) define_str = "\n".join(defines)
self.add_global_code(define_str) self.add_global_code(define_str)
self.build_meta.add_global_cflags("cl", "/DNOMINMAX")
# for name in dir(AllocKeys): # for name in dir(AllocKeys):
# if not name.startswith("__"): # if not name.startswith("__"):
# v = getattr(AllocKeys, name) # v = getattr(AllocKeys, name)
...@@ -1580,10 +1581,10 @@ class SpconvOps(pccm.Class): ...@@ -1580,10 +1581,10 @@ class SpconvOps(pccm.Class):
}} }}
if (!subm){{ if (!subm){{
size_t pair_single_size = kv * int64_t(num_act_in); size_t pair_single_size = kv * int64_t(num_act_in);
auto ten = tv::from_blob(workspace, {{pair_single_size + 1}}, use_int64_hash_k ? tv::int64 : tv::int32, 0); auto ten = tv::from_blob(workspace, {{int64_t(pair_single_size + 1)}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
res.insert({{{pccm.literal(AllocKeys.IndicePairsUniq)}, ten}}); res.insert({{{pccm.literal(AllocKeys.IndicePairsUniq)}, ten}});
workspace += ten.nbytes(); workspace += ten.nbytes();
auto ten2 = tv::from_blob(workspace, {{pair_single_size + 1}}, use_int64_hash_k ? tv::int64 : tv::int32, 0); auto ten2 = tv::from_blob(workspace, {{int64_t(pair_single_size + 1)}}, use_int64_hash_k ? tv::int64 : tv::int32, 0);
res.insert({{{pccm.literal(AllocKeys.IndicePairsUniqBackup)}, ten2}}); res.insert({{{pccm.literal(AllocKeys.IndicePairsUniqBackup)}, ten2}});
workspace += ten2.nbytes(); workspace += ten2.nbytes();
}} }}
......
...@@ -220,7 +220,7 @@ class ThrustAllocator(pccm.Class): ...@@ -220,7 +220,7 @@ class ThrustAllocator(pccm.Class):
code.arg("ptr", "char *") code.arg("ptr", "char *")
code.arg("num_bytes", "size_t") code.arg("num_bytes", "size_t")
code.raw(f""" code.raw(f"""
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0)); return allocator_.free_noexcept(tv::from_blob(ptr, {{int64_t(num_bytes)}}, tv::uint8, 0));
""") """)
return code return code
......
...@@ -64,7 +64,6 @@ class PointCloudCompress(pccm.Class): ...@@ -64,7 +64,6 @@ class PointCloudCompress(pccm.Class):
auto point_stride = points.stride(0); auto point_stride = points.stride(0);
int64_t final_size = sizeof(int64_t) * 5 + sizeof(float) * 3; int64_t final_size = sizeof(int64_t) * 5 + sizeof(float) * 3;
tv::Tensor res; tv::Tensor res;
tv::ssprint(1);
tv::dispatch<float, double>(points.dtype(), [&](auto IP){{ tv::dispatch<float, double>(points.dtype(), [&](auto IP){{
using TPoint = TV_DECLTYPE(IP); using TPoint = TV_DECLTYPE(IP);
...@@ -88,13 +87,13 @@ class PointCloudCompress(pccm.Class): ...@@ -88,13 +87,13 @@ class PointCloudCompress(pccm.Class):
auto pos_int = op::apply(floorf, pos_unit_voxel).cast<int32_t>(); auto pos_int = op::apply(floorf, pos_unit_voxel).cast<int32_t>();
auto pos_enc = (point / errors - pos_int.cast<float>() * float(256)).cast<uint8_t>(); auto pos_enc = (point / errors - pos_int.cast<float>() * float(256)).cast<uint8_t>();
tv::array<uint8_t, kEncodeDim> enc; tv::array<uint8_t, kEncodeDim> enc;
tv::if_constexpr<(kEncodeDim > 3)>([&](auto _){{ enc[0] = pos_enc[0];
enc[1] = pos_enc[1];
enc[2] = pos_enc[2];
if (kEncodeDim > 3){{
TInten inten = intensity_data[0]; TInten inten = intensity_data[0];
enc = _(tv::array<uint8_t, kEncodeDim>{{pos_enc[0], pos_enc[1], pos_enc[2], uint8_t(inten)}}); enc[3] = uint8_t(inten);
intensity_data += inten_stride; }}
}}, [&](auto _){{
enc = _(tv::array<uint8_t, kEncodeDim>{{pos_enc[0], pos_enc[1], pos_enc[2]}});
}});
auto pos_uint = pos_int + hash_t::direct_hash_offset(); auto pos_uint = pos_int + hash_t::direct_hash_offset();
uint64_t scalar = hash_t::encode(pos_int[0], pos_int[1], pos_int[2]); uint64_t scalar = hash_t::encode(pos_int[0], pos_int[1], pos_int[2]);
auto iter = hash.find(scalar); auto iter = hash.find(scalar);
...@@ -225,7 +224,7 @@ class PointCloudCompress(pccm.Class): ...@@ -225,7 +224,7 @@ class PointCloudCompress(pccm.Class):
error[2] = error_header[2]; error[2] = error_header[2];
res_ptr += sizeof(float) * 3; res_ptr += sizeof(float) * 3;
tv::Tensor points; tv::Tensor points;
tv::dispatch_int<static_cast<int>(EncodeType::XYZI_8), static_cast<int>(EncodeType::XYZ_8)>(static_cast<int>(type), [&](auto I){{ tv::dispatch_int<static_cast<int>(EncodeType::XYZI_8), static_cast<int>(EncodeType::XYZ_8)>(static_cast<int>(type), [&, error](auto I){{
constexpr int kTypeInt = TV_DECLTYPE(I)::value; constexpr int kTypeInt = TV_DECLTYPE(I)::value;
constexpr int kEncodeDim = kTypeInt == static_cast<int>(EncodeType::XYZI_8) ? 4 : 3; constexpr int kEncodeDim = kTypeInt == static_cast<int>(EncodeType::XYZI_8) ? 4 : 3;
points = tv::empty({{N, kEncodeDim}}, tv::float32); points = tv::empty({{N, kEncodeDim}}, tv::float32);
...@@ -241,7 +240,7 @@ class PointCloudCompress(pccm.Class): ...@@ -241,7 +240,7 @@ class PointCloudCompress(pccm.Class):
auto point_cur_ptr = points_ptr; auto point_cur_ptr = points_ptr;
for (int j = 0; j < cluster_size; ++j){{ for (int j = 0; j < cluster_size; ++j){{
auto& enc = enc_ptr[j]; auto& enc = enc_ptr[j];
auto point = op::slice<0, 3>(enc).template cast<float>() * error + offset; tv::array<float, 3> point = op::slice<0, 3>(enc).template cast<float>() * error + offset;
point_cur_ptr[0] = point[0]; point_cur_ptr[0] = point[0];
point_cur_ptr[1] = point[1]; point_cur_ptr[1] = point[1];
point_cur_ptr[2] = point[2]; point_cur_ptr[2] = point[2];
......
...@@ -640,6 +640,7 @@ def _test_native_conv_cuda(subm: bool): ...@@ -640,6 +640,7 @@ def _test_native_conv_cuda(subm: bool):
arch = torch.cuda.get_device_capability() arch = torch.cuda.get_device_capability()
stream = get_current_stream() stream = get_current_stream()
force_nvrtc = False
for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid( for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
shapes, batchsizes, in_channels, out_channels, ksizes, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations, dtypes)): strides, paddings, dilations, dtypes)):
...@@ -718,7 +719,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -718,7 +719,8 @@ def _test_native_conv_cuda(subm: bool):
c_inds=out_indices, c_inds=out_indices,
hint=AlgoHint.Fowrard.value, hint=AlgoHint.Fowrard.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
else: else:
GEMM.run_with_tuned_result( GEMM.run_with_tuned_result(
BestAlgoByProfile(desp, tester.arch, 1), BestAlgoByProfile(desp, tester.arch, 1),
...@@ -735,7 +737,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -735,7 +737,8 @@ def _test_native_conv_cuda(subm: bool):
c_inds=out_indices, c_inds=out_indices,
hint=AlgoHint.Fowrard.value, hint=AlgoHint.Fowrard.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
inited = True inited = True
if bias is not None and tester.check_act: if bias is not None and tester.check_act:
InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0) InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0)
...@@ -801,7 +804,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -801,7 +804,8 @@ def _test_native_conv_cuda(subm: bool):
c_inds=inp_indices, c_inds=inp_indices,
hint=AlgoHint.Fowrard.value, hint=AlgoHint.Fowrard.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
else: else:
GEMM.run_with_tuned_result( GEMM.run_with_tuned_result(
BestAlgoByProfile(desp, tester.arch, 1), BestAlgoByProfile(desp, tester.arch, 1),
...@@ -818,7 +822,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -818,7 +822,8 @@ def _test_native_conv_cuda(subm: bool):
c_inds=inp_indices, c_inds=inp_indices,
hint=AlgoHint.Fowrard.value, hint=AlgoHint.Fowrard.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
inited = True inited = True
din_my = inp_tv.cpu().numpy() din_my = inp_tv.cpu().numpy()
...@@ -879,7 +884,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -879,7 +884,8 @@ def _test_native_conv_cuda(subm: bool):
c_inds=tv.Tensor(), c_inds=tv.Tensor(),
hint=AlgoHint.BackwardWeight.value, hint=AlgoHint.BackwardWeight.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
else: else:
GEMM.run_with_tuned_result(BestAlgoByProfile(desp, tester.arch, 32), GEMM.run_with_tuned_result(BestAlgoByProfile(desp, tester.arch, 32),
...@@ -896,7 +902,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -896,7 +902,8 @@ def _test_native_conv_cuda(subm: bool):
b_inds=b_inds, b_inds=b_inds,
hint=AlgoHint.BackwardWeight.value, hint=AlgoHint.BackwardWeight.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta,
force_nvrtc=force_nvrtc)
dw_my = weight_tv.cpu().numpy() dw_my = weight_tv.cpu().numpy()
if dtype != np.float16: if dtype != np.float16:
...@@ -909,8 +916,8 @@ def _test_native_conv_cuda(subm: bool): ...@@ -909,8 +916,8 @@ def _test_native_conv_cuda(subm: bool):
def test_all_algo_unit(): def test_all_algo_unit():
# for i in range(5): # for i in range(5):
_test_impgemm_conv_cuda(True) # _test_impgemm_conv_cuda(True)
_test_impgemm_conv_cuda(False) # _test_impgemm_conv_cuda(False)
_test_native_conv_cuda(True) _test_native_conv_cuda(True)
_test_native_conv_cuda(False) _test_native_conv_cuda(False)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment