Commit 2f66dd23 authored by yan.yan's avatar yan.yan
Browse files

fix cpu only build problem

parent 6a32c34f
......@@ -112,6 +112,8 @@ We offer python 3.7-3.11 and cuda 10.2/11.4/11.7/12.0 prebuilt binaries for wind
Linux users must first install pip >= 20.3 in order to install the prebuilt binaries.
**WARNING**: spconv-cu117 may require CUDA Driver >= 515.
```pip install spconv``` for CPU only (**Linux Only**). You should only use this build for debugging: its performance isn't optimized because of manylinux limitations (no OpenMP support).
```pip install spconv-cu102``` for CUDA 10.2
......
[build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.0"]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.1"]
build-backend = "setuptools.build_meta"
......@@ -38,9 +38,9 @@ if cuda_ver:
cuda_ver = cuda_ver.replace(".", "") # 10.2 to 102
RELEASE_NAME += "-cu{}".format(cuda_ver)
deps = ["cumm-cu{}>=0.3.0".format(cuda_ver)]
deps = ["cumm-cu{}>=0.3.1".format(cuda_ver)]
else:
deps = ["cumm>=0.3.0"]
deps = ["cumm>=0.3.1"]
......
......@@ -64,7 +64,7 @@ def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
desp.min_arch = ker.min_arch()
return desp
......@@ -87,6 +87,7 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
desp.min_arch = ker.min_arch()
return desp
......
......@@ -13,14 +13,10 @@
# limitations under the License.
import spconv.core_cc as _ext
if hasattr(_ext, "cumm"):
CPU_ONLY_BUILD = False
else:
CPU_ONLY_BUILD = True
from spconv.core_cc.csrc.sparse.all import SpconvOps
CPU_ONLY_BUILD = SpconvOps.is_cpu_only_build()
BUILD_CUMM_VERSION = SpconvOps.cumm_version()
BUILD_PCCM_VERSION = SpconvOps.pccm_version()
from spconv.core_cc.csrc.utils.boxops import BoxOps
......
......@@ -84,6 +84,10 @@ class HashCoreHost(pccm.Class):
self.add_include("tensorview/hash/hash_core.h")
class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD:
_STATIC_FUNCTION = pccm.static_function
else:
_STATIC_FUNCTION = pccm.cuda.static_function
def __init__(self):
super().__init__()
self.add_dependency(ThrustCustomAllocatorV2, ExternalAllocator, GemmBasicHost, ThrustAllocator)
......@@ -143,6 +147,15 @@ class SpconvOps(pccm.Class):
""")
return code.ret("std::string")
# Code generator for a static C++ function returning whether this extension
# was produced by a CPU-only build. It is marked for pybind export so Python
# code can query it (e.g. as SpconvOps.is_cpu_only_build()).
# It is declared with the plain pccm.static_function decorator, not the
# pccm.cuda variant, because a CPU-only build can't use pccm.cuda.
@pccm.pybind.mark
@pccm.static_function
def is_cpu_only_build(self):
# Start an empty function description; no arguments are declared.
code = pccm.FunctionCode()
# The flag is interpolated into the generated source text here, so the
# emitted function returns a value fixed at code-generation time.
# NOTE(review): pccm.literal presumably renders the Python bool as a C++
# literal (true/false) -- confirm against pccm.
code.raw(f"""
return {pccm.literal(CUMM_CPU_ONLY_BUILD)};
""")
# Declare the generated function's return type as bool.
return code.ret("bool")
@pccm.pybind.mark
@pccm.static_function
def pccm_version(self):
......@@ -155,7 +168,7 @@ class SpconvOps(pccm.Class):
return code.ret("std::string")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
......@@ -200,7 +213,7 @@ class SpconvOps(pccm.Class):
return code # .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode()
code.arg("indice_pairs_uniq", "tv::Tensor")
......@@ -219,7 +232,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_stage2(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata_k, hashdata_v", "tv::Tensor")
......@@ -270,7 +283,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_mask_stage1(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -316,7 +329,7 @@ class SpconvOps(pccm.Class):
return code # .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_mask_stage1_direct_table(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -362,7 +375,7 @@ class SpconvOps(pccm.Class):
return code # .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def unique_hash(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -378,7 +391,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def assign_output_direct_hash(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -420,7 +433,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_mask_stage2(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -470,7 +483,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_conv_inds_stage2_mask_direct_table(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -519,7 +532,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def generate_subm_conv_inds(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -634,7 +647,7 @@ class SpconvOps(pccm.Class):
return code.ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def maxpool_forward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -651,7 +664,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def maxpool_backward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -671,7 +684,7 @@ class SpconvOps(pccm.Class):
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def indice_maxpool(self):
code = pccm.FunctionCode()
code.arg("out_features, features", "tv::Tensor")
......@@ -715,7 +728,7 @@ class SpconvOps(pccm.Class):
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def indice_maxpool_backward(self):
code = pccm.FunctionCode()
code.arg("din, features, out_features, out_bp", "tv::Tensor")
......@@ -757,7 +770,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def maxpool_implicit_gemm_forward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -773,7 +786,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def maxpool_implicit_gemm_backward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -791,7 +804,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def avgpool_implicit_gemm_forward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -808,7 +821,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def avgpool_implicit_gemm_backward(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -936,19 +949,19 @@ class SpconvOps(pccm.Class):
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key_allocator(self):
# for python
return self.sort_1d_by_key_allocator_template(False)
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key_allocator_v2(self):
# for cpp only
return self.sort_1d_by_key_allocator_template(True)
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key_split(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -1070,17 +1083,17 @@ class SpconvOps(pccm.Class):
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key_split_allocator(self):
return self.sort_1d_by_key_split_allocator_template(False)
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key_split_allocator_v2(self):
return self.sort_1d_by_key_split_allocator_template(True)
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def count_bits(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -1140,7 +1153,7 @@ class SpconvOps(pccm.Class):
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def reverse_bits(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -1202,7 +1215,7 @@ class SpconvOps(pccm.Class):
# cpu only build can't use pccm.cuda
__CUDA_DECORATOR = pccm.static_function
if not CUMM_CPU_ONLY_BUILD:
__CUDA_DECORATOR = pccm.cuda.static_function
__CUDA_DECORATOR = _STATIC_FUNCTION
@pccm.pybind.mark
@__CUDA_DECORATOR
......@@ -1243,7 +1256,7 @@ class SpconvOps(pccm.Class):
return code
@pccm.pybind.mark
@pccm.cuda.static_function
@_STATIC_FUNCTION
def sort_1d_by_key(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
......@@ -1475,13 +1488,16 @@ class SpconvOps(pccm.Class):
""")
return code.ret("std::vector<int>")
@pccm.cuda.static_function
@_STATIC_FUNCTION
def apply_thrust_unique_to_indice_pairs_uniq(self):
code = pccm.code()
code.add_dependency(CustomThrustLib)
code.arg("data", "tv::Tensor")
code.arg("allocator", "ThrustAllocator&")
code.arg("stream_int", f"std::uintptr_t", "0")
if CUMM_CPU_ONLY_BUILD:
return code.make_invalid()
code.add_dependency(CustomThrustLib)
code.raw(f"""
int num_out_act = 0;
int uniq_size = data.dim(0);
......@@ -1622,7 +1638,7 @@ class SpconvOps(pccm.Class):
if CUMM_CPU_ONLY_BUILD:
code.raw(f"""
throw std::runtime_error("this function can only be used with CUDA.")
TV_THROW_RT_ERR("this function can only be used with CUDA.");
""")
return code.ret("std::tuple<tv::Tensor, int>")
code.raw(f"""
......
......@@ -624,6 +624,9 @@ class GemmTunerSimple(pccm.ParameterizedClass):
code.arg("shuffle_type", "int")
code.arg("a_inds_shape, b_inds_shape, c_inds_shape",
"std::vector<int64_t>")
if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code.ret("std::tuple<int, int, int>")
code.raw(f"""
return GemmMain::extract_mnk(a_shape, b_shape, trans_a,
trans_b, trans_c,
......
......@@ -148,7 +148,10 @@ class InferenceOpsKernel(pccm.ParameterizedClass):
class InferenceOps(pccm.Class):
def __init__(self):
super().__init__()
self.add_dependency(TensorView, LaunchUtils)
self.add_dependency(TensorView)
if not CUMM_CPU_ONLY_BUILD:
self.add_dependency(LaunchUtils)
self.kernel = InferenceOpsKernel()
self.add_include("tensorview/gemm/core/constants.h")
self.add_static_const("kMaxGridYZDim", "int", "65535")
......
......@@ -30,14 +30,13 @@ from spconv.constants import SPCONV_CPP_INDICE_PAIRS, SPCONV_CPP_INDICE_PAIRS_IG
import spconv.core_cc as _ext
from spconv.core_cc.csrc.sparse.convops.spops import ConvGemmOps
from spconv.core_cc.csrc.sparse.inference import InferenceOps
from spconv.cppconstants import CPU_ONLY_BUILD
from spconv.utils import nullcontext
if hasattr(_ext, "cumm"):
CPU_ONLY_BUILD = False
if not CPU_ONLY_BUILD:
from spconv.algo import GEMM, CONV, GEMM_CPP, CONV_CPP
else:
CPU_ONLY_BUILD = True
GEMM = None
CONV = None
GEMM_CPP = None
......@@ -1175,7 +1174,6 @@ def indice_conv_backward(features: torch.Tensor,
return (din, dfilters.reshape(filters_shape))
maxnhot = max(indice_pair_num_cpu)
arch = get_arch()
filters_tv = torch_tensor_to_tv(filters)
dfilters_tv = torch_tensor_to_tv(dfilters)
......@@ -1224,7 +1222,7 @@ def indice_conv_backward(features: torch.Tensor,
torch.mm(out_buffer[:nhot], filters_KC, out=inp_buffer[:nhot])
SpconvOps.scatter_add_cpu(din_tv, inp_buffer_tv, inp_indices)
return (din, dfilters.reshape(filters_shape))
arch = get_arch()
profile_idx = kv_center
if subm or indice_pair_num_cpu[profile_idx] == 0:
profile_idx = kv_center - 1
......
......@@ -22,7 +22,7 @@ from cumm import tensorview as tv
from spconv.core import ConvAlgo
import spconv.pytorch as spconv
from spconv.utils import Point2VoxelCPU3d, Point2VoxelGPU3d
from spconv.utils import Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def waymo_data(batch_size=1, num_features=-1):
......@@ -44,6 +44,8 @@ def waymo_data(batch_size=1, num_features=-1):
return voxels, coors, gen.grid_size
def waymo_data_large(batch_size=1):
from spconv.utils import Point2VoxelGPU3d
gen = Point2VoxelGPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
1600000, 1)
# gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
......@@ -395,7 +397,7 @@ def main():
# voxels, coors, spatial_shape = waymo_data(num_features=3)
with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
(voxels, coors, spatial_shape) = pickle.load(f)
voxels, coors, spatial_shape = waymo_data_large()
# voxels, coors, spatial_shape = waymo_data_large()
# breakpoint()
print(spatial_shape)
......@@ -478,11 +480,11 @@ def main():
# for i in range(10):
# out = net(voxels_th, coors_th, 1)
# print("------------")
# torch.cuda.synchronize()
# t = time.time()
# # torch.cuda.synchronize()
# # t = time.time()
# out.features.backward(dout_t)
# torch.cuda.synchronize()
# times.append(time.time() - t)
# # torch.cuda.synchronize()
# # times.append(time.time() - t)
# # # print((net.grid == -1).float().sum(), net.grid.numel())
# # # print("spconv time", time.time() - t)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment