fix cpu only build problem

2f66dd23 · yan.yan · 6a32c34f · 2f66dd23 · 2f66dd23 · 2f66dd23
Commit 2f66dd23 authored Sep 25, 2022 by yan.yan
10 changed files
--- a/README.md
+++ b/README.md
@@ -112,6 +112,8 @@ We offer python 3.7-3.11 and cuda 10.2/11.4/11.7/12.0 prebuilt binaries for wind
 For Linux users, you need to install pip >= 20.3 first in order to install the prebuilt binaries.
+**WARNING**: spconv-cu117 may require CUDA Driver >= 515.
 ```pip install spconv``` for CPU only (**Linux Only**). You should only use this for debugging; the performance isn't optimized due to manylinux limitations (no OpenMP support).
 ```pip install spconv-cu102``` for CUDA 10.2

--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
-requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.0"]
+requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.1"]
 build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
@@ -38,9 +38,9 @@ if cuda_ver:
    cuda_ver = cuda_ver.replace(".", "") # 10.2 to 102
    RELEASE_NAME += "-cu{}".format(cuda_ver)
-    deps = ["cumm-cu{}>=0.3.0".format(cuda_ver)]
+    deps = ["cumm-cu{}>=0.3.1".format(cuda_ver)]
 else:
-    deps = ["cumm>=0.3.0"]
+    deps = ["cumm>=0.3.1"]

--- a/spconv/algocore.py
+++ b/spconv/algocore.py
@@ -64,7 +64,7 @@ def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
+    desp.min_arch = ker.min_arch()
    return desp
@@ -87,6 +87,7 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
+    desp.min_arch = ker.min_arch()
    return desp

--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
@@ -13,14 +13,10 @@
 # limitations under the License.
 import spconv.core_cc as _ext
-if hasattr(_ext, "cumm"):
-    CPU_ONLY_BUILD = False
-else:
-    CPU_ONLY_BUILD = True
 from spconv.core_cc.csrc.sparse.all import SpconvOps
+CPU_ONLY_BUILD = SpconvOps.is_cpu_only_build()
 BUILD_CUMM_VERSION = SpconvOps.cumm_version()
 BUILD_PCCM_VERSION = SpconvOps.pccm_version()
 from spconv.core_cc.csrc.utils.boxops import BoxOps

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -84,6 +84,10 @@ class HashCoreHost(pccm.Class):
        self.add_include("tensorview/hash/hash_core.h")
 class SpconvOps(pccm.Class):
+    if CUMM_CPU_ONLY_BUILD:
+        _STATIC_FUNCTION = pccm.static_function
+    else:
+        _STATIC_FUNCTION = pccm.cuda.static_function
    def __init__(self):
        super().__init__()
        self.add_dependency(ThrustCustomAllocatorV2, ExternalAllocator, GemmBasicHost, ThrustAllocator)
@@ -143,6 +147,15 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("std::string")
+    @pccm.pybind.mark
+    @pccm.static_function
+    def is_cpu_only_build(self):
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        return {pccm.literal(CUMM_CPU_ONLY_BUILD)};
+        """)
+        return code.ret("bool")
    @pccm.pybind.mark
    @pccm.static_function
    def pccm_version(self):
@@ -155,7 +168,7 @@ class SpconvOps(pccm.Class):
        return code.ret("std::string")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
@@ -200,7 +213,7 @@ class SpconvOps(pccm.Class):
        return code  # .ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_stage1_5(self):
        code = pccm.FunctionCode()
        code.arg("indice_pairs_uniq", "tv::Tensor")
@@ -219,7 +232,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_stage2(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata_k, hashdata_v", "tv::Tensor")
@@ -270,7 +283,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_mask_stage1(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -316,7 +329,7 @@ class SpconvOps(pccm.Class):
        return code  # .ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_mask_stage1_direct_table(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -362,7 +375,7 @@ class SpconvOps(pccm.Class):
        return code  # .ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def unique_hash(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -378,7 +391,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def assign_output_direct_hash(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -420,7 +433,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_mask_stage2(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -470,7 +483,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_conv_inds_stage2_mask_direct_table(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -519,7 +532,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def generate_subm_conv_inds(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -634,7 +647,7 @@ class SpconvOps(pccm.Class):
        return code.ret("int")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def maxpool_forward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -651,7 +664,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def maxpool_backward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -671,7 +684,7 @@ class SpconvOps(pccm.Class):
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def indice_maxpool(self):
        code = pccm.FunctionCode()
        code.arg("out_features, features", "tv::Tensor")
@@ -715,7 +728,7 @@ class SpconvOps(pccm.Class):
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def indice_maxpool_backward(self):
        code = pccm.FunctionCode()
        code.arg("din, features, out_features, out_bp", "tv::Tensor")
@@ -757,7 +770,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def maxpool_implicit_gemm_forward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -773,7 +786,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def maxpool_implicit_gemm_backward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -791,7 +804,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def avgpool_implicit_gemm_forward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -808,7 +821,7 @@ class SpconvOps(pccm.Class):
        return code
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def avgpool_implicit_gemm_backward(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -936,19 +949,19 @@ class SpconvOps(pccm.Class):
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key_allocator(self):
        # for python
        return self.sort_1d_by_key_allocator_template(False)
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key_allocator_v2(self):
        # for cpp only
        return self.sort_1d_by_key_allocator_template(True)
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key_split(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -1070,17 +1083,17 @@ class SpconvOps(pccm.Class):
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key_split_allocator(self):
        return self.sort_1d_by_key_split_allocator_template(False)
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key_split_allocator_v2(self):
        return self.sort_1d_by_key_split_allocator_template(True)
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def count_bits(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -1140,7 +1153,7 @@ class SpconvOps(pccm.Class):
        return code.ret("tv::Tensor")
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def reverse_bits(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -1202,7 +1215,7 @@ class SpconvOps(pccm.Class):
    # cpu only build can't use pccm.cuda
    __CUDA_DECORATOR = pccm.static_function
    if not CUMM_CPU_ONLY_BUILD:
-        __CUDA_DECORATOR = pccm.cuda.static_function
+        __CUDA_DECORATOR = _STATIC_FUNCTION
    @pccm.pybind.mark 
    @__CUDA_DECORATOR
@@ -1243,7 +1256,7 @@ class SpconvOps(pccm.Class):
        return code 
    @pccm.pybind.mark
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def sort_1d_by_key(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
@@ -1475,13 +1488,16 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("std::vector<int>")
-    @pccm.cuda.static_function
+    @_STATIC_FUNCTION
    def apply_thrust_unique_to_indice_pairs_uniq(self):
        code = pccm.code()
-        code.add_dependency(CustomThrustLib)
        code.arg("data", "tv::Tensor")
        code.arg("allocator", "ThrustAllocator&")
        code.arg("stream_int", f"std::uintptr_t", "0")
+        if CUMM_CPU_ONLY_BUILD:
+            return code.make_invalid()
+        code.add_dependency(CustomThrustLib)
        code.raw(f"""
        int num_out_act = 0;
        int uniq_size = data.dim(0);
@@ -1622,7 +1638,7 @@ class SpconvOps(pccm.Class):
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
-            throw std::runtime_error("this function can only be used with CUDA.")
+            TV_THROW_RT_ERR("this function can only be used with CUDA.");
            """)
            return code.ret("std::tuple<tv::Tensor, int>")
        code.raw(f"""

--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py
@@ -624,6 +624,9 @@ class GemmTunerSimple(pccm.ParameterizedClass):
        code.arg("shuffle_type", "int")
        code.arg("a_inds_shape, b_inds_shape, c_inds_shape",
                 "std::vector<int64_t>")
+        if CUMM_CPU_ONLY_BUILD:
+            code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
+            return code.ret("std::tuple<int, int, int>")
        code.raw(f"""
        return GemmMain::extract_mnk(a_shape, b_shape, trans_a,
                                    trans_b, trans_c,

--- a/spconv/csrc/sparse/inference.py
+++ b/spconv/csrc/sparse/inference.py
@@ -148,7 +148,10 @@ class InferenceOpsKernel(pccm.ParameterizedClass):
 class InferenceOps(pccm.Class):
    def __init__(self):
        super().__init__()
-        self.add_dependency(TensorView, LaunchUtils)
+        self.add_dependency(TensorView)
+        if not CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(LaunchUtils)
        self.kernel = InferenceOpsKernel()
        self.add_include("tensorview/gemm/core/constants.h")
        self.add_static_const("kMaxGridYZDim", "int", "65535")

--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -30,14 +30,13 @@ from spconv.constants import SPCONV_CPP_INDICE_PAIRS, SPCONV_CPP_INDICE_PAIRS_IG
 import spconv.core_cc as _ext
 from spconv.core_cc.csrc.sparse.convops.spops import ConvGemmOps
 from spconv.core_cc.csrc.sparse.inference import InferenceOps
+from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.utils import nullcontext
-if hasattr(_ext, "cumm"):
+if not CPU_ONLY_BUILD:
-    CPU_ONLY_BUILD = False
    from spconv.algo import GEMM, CONV, GEMM_CPP, CONV_CPP
 else:
-    CPU_ONLY_BUILD = True
    GEMM = None
    CONV = None
    GEMM_CPP = None
@@ -1175,7 +1174,6 @@ def indice_conv_backward(features: torch.Tensor,
        return (din, dfilters.reshape(filters_shape))
    maxnhot = max(indice_pair_num_cpu)
-    arch = get_arch()
    filters_tv = torch_tensor_to_tv(filters)
    dfilters_tv = torch_tensor_to_tv(dfilters)
@@ -1224,7 +1222,7 @@ def indice_conv_backward(features: torch.Tensor,
            torch.mm(out_buffer[:nhot], filters_KC, out=inp_buffer[:nhot])
            SpconvOps.scatter_add_cpu(din_tv, inp_buffer_tv, inp_indices)
        return (din, dfilters.reshape(filters_shape))
+    arch = get_arch()
    profile_idx = kv_center
    if subm or indice_pair_num_cpu[profile_idx] == 0:
        profile_idx = kv_center - 1

--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -22,7 +22,7 @@ from cumm import tensorview as tv
 from spconv.core import ConvAlgo
 import spconv.pytorch as spconv
-from spconv.utils import Point2VoxelCPU3d, Point2VoxelGPU3d
+from spconv.utils import Point2VoxelCPU3d
 # torch.backends.cudnn.enabled = False
 def waymo_data(batch_size=1, num_features=-1):
@@ -44,6 +44,8 @@ def waymo_data(batch_size=1, num_features=-1):
    return voxels, coors, gen.grid_size
 def waymo_data_large(batch_size=1):
+    from spconv.utils import Point2VoxelGPU3d
    gen = Point2VoxelGPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
                           1600000, 1)
    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
@@ -395,7 +397,7 @@ def main():
    # voxels, coors, spatial_shape = waymo_data(num_features=3)
    with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
        (voxels, coors, spatial_shape) = pickle.load(f)
-    voxels, coors, spatial_shape = waymo_data_large()
+    # voxels, coors, spatial_shape = waymo_data_large()
    # breakpoint()
    print(spatial_shape)
@@ -478,11 +480,11 @@ def main():
    # for i in range(10):
    #     out = net(voxels_th, coors_th, 1)
    #     print("------------")
-    #     torch.cuda.synchronize()
+    #     # torch.cuda.synchronize()
-    #     t = time.time()
+    #     # t = time.time()
    #     out.features.backward(dout_t)
-    #     torch.cuda.synchronize()
+    #     # torch.cuda.synchronize()
-    #     times.append(time.time() - t)
+    #     # times.append(time.time() - t)
    # # # print((net.grid == -1).float().sum(), net.grid.numel())
    # # # print("spconv time", time.time() - t)