Commit 899008fa authored by yan.yan's avatar yan.yan
Browse files

working on c++ only

parent f78575ea
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How to develop spconv 2.x
## First step
spconv 2.x is written in a unique C++ framework, ```pccm```. Read the [pccm guide]() to learn how to use ```pccm```.
It's recommended to uninstall any spconv and cumm packages installed by pip, then install both spconv and cumm in editable mode (```pip install -e .```).
## Architecture
\ No newline at end of file
......@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable
from cumm.common import CompileInfo
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
......@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14"
else:
std = "c++17"
cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
if CUMM_CPU_ONLY_BUILD:
cus = [SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
else:
gemmtuner = GemmTunerSimple(None)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(None)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
cus = [gemmtuner, convtuner,
convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul()]
if not CUMM_CPU_ONLY_BUILD:
cus.extend([cu, convcu])
ext_modules: List[Extension] = [
PCCMExtension(cus,
"spconv/core_cc",
Path(__file__).resolve().parent / "spconv",
objects_folder="objects",
std=std,
disable_pch=True,
verbose=True)
......
......@@ -37,7 +37,7 @@ from cumm import dtypes
from spconv.constants import (NDIM_DONT_CARE, SPCONV_BWD_SPLITK,
SPCONV_NVRTC_MODE, SPCONV_DEBUG_NVRTC_KERNELS)
from spconv.core import ALL_IMPGEMM_PARAMS, AlgoHint, ConvAlgo
from spconv.core import ALL_IMPGEMM_PARAMS, AlgoHint, ConvAlgo, ALL_NATIVE_PARAMS
from spconv.core_cc.cumm.conv.main import ConvMainUnitTest
from spconv.core_cc.cumm.gemm.main import GemmMainUnitTest
from spconv.cppconstants import COMPILED_CUDA_ARCHS
......@@ -49,14 +49,17 @@ from spconv import algocore
from cumm.conv.main import gen_gemm_kernels as gen_conv_kernels
from cumm.gemm.main import gen_gemm_kernels
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult
from spconv.core_cc.csrc.sparse.convops.gemmops import GemmTunerSimple as GemmTunerSimpleBase
from spconv.core_cc.csrc.sparse.convops.convops import ConvTunerSimple as ConvTunerSimpleBase
ALL_ALGO_DESPS = GemmMainUnitTest.get_all_algo_desp()
ALL_CONV_ALGO_DESPS = ConvMainUnitTest.get_all_conv_algo_desp()
_GEMM_STATIC_KEY = Tuple[bool, bool, bool, int, int, int, str, str]
_GEMM_STATIC_KEY = Tuple[bool, bool, bool, int, int, int, int, str]
class SimpleGemmAlgoMeta:
def __init__(self, tile_ms: List[int], tile_ns: List[int],
tile_ks: List[int],
tile_shape_to_algos: Dict[int, List[int]]) -> None:
......@@ -67,19 +70,29 @@ class SimpleGemmAlgoMeta:
class BestAlgoByProfile:
    """Plain record pairing a GEMM algorithm descriptor with its split-k and arch.

    The constructor only stores its arguments; no validation is performed.
    """

    def __init__(self,
                 algo_desp: GemmAlgoDesp,
                 arch: Tuple[int, int],
                 splitk: int = 1) -> None:
        """Store the given values on the instance.

        Args:
            algo_desp: descriptor of the GEMM algorithm this record refers to.
            arch: pair of ints identifying the target architecture
                (presumably the CUDA compute capability, e.g. ``(7, 5)`` --
                TODO confirm against callers).
            splitk: split-k value associated with the record; defaults to 1.
        """
        self.algo_desp = algo_desp
        self.splitk = splitk
        self.arch = arch
class BestConvAlgoByProfile:
    """Plain record pairing a conv algorithm descriptor with its split-k and arch.

    The constructor only stores its arguments; no validation is performed.
    """

    def __init__(self,
                 algo_desp: ConvAlgoDesp,
                 arch: Tuple[int, int],
                 splitk: int = 1) -> None:
        """Store the given values on the instance.

        Args:
            algo_desp: descriptor of the convolution algorithm this record
                refers to.
            arch: pair of ints identifying the target architecture
                (presumably the CUDA compute capability, e.g. ``(7, 5)`` --
                TODO confirm against callers).
            splitk: split-k value associated with the record; defaults to 1.
        """
        self.algo_desp = algo_desp
        self.splitk = splitk
        self.arch = arch
def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel], kernel_name: str):
def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
kernel_name: str):
nvrtc_mode = SPCONV_NVRTC_MODE
nvrtc_params = tv.gemm.NVRTCParams()
nvrtc_params.cumodule = mod.get_cpp_object()
......@@ -89,8 +102,7 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
ns = ker.namespace
if nvrtc_mode == NVRTCMode.DynamicParallism:
nvrtc_params.kernel_name = mod.get_lowered_name(
f"{ns}::nvrtc_kernel")
nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::nvrtc_kernel")
elif nvrtc_mode == NVRTCMode.KernelAndCPU:
nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::{kernel_name}")
......@@ -101,8 +113,10 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
nvrtc_params.param_storage = tv.empty([nvrtc_params.param_size],
tv.uint8, 0)
nvrtc_params.param_storage_cpu = tv.empty(
[nvrtc_params.param_size], tv.uint8, -1, pinned=True)
nvrtc_params.param_storage_cpu = tv.empty([nvrtc_params.param_size],
tv.uint8,
-1,
pinned=True)
elif nvrtc_mode == NVRTCMode.Direct:
nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::{kernel_name}")
......@@ -120,9 +134,84 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
raise NotImplementedError
return nvrtc_params
class GemmTunerSimple(GemmTunerSimpleBase):
    """``GemmTunerSimpleBase`` subclass that can build GEMM kernels with NVRTC.

    Compiled kernels are wrapped into NVRTC parameter objects and memoised per
    ``(str(desp), arch, stream_int)`` so each combination is compiled once.
    """

    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
        """Forward ``desps`` to the base tuner and start with an empty cache."""
        super().__init__(desps)
        # key: (descriptor string, arch tuple, stream id) -> NVRTC params
        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int],
                                 NVRTCParams] = {}

    def _compile_nvrtc_module(self, desp: GemmAlgoDesp):
        """Generate the kernel for ``desp`` and load it as an NVRTC module.

        Returns:
            ``(module, kernel)``: the loaded ``CummNVRTCModule`` and the
            generated kernel object it was built from.
        """
        kernel = gen_gemm_kernels(algocore.get_gemm_param_from_desp(desp),
                                  SPCONV_NVRTC_MODE)
        kernel.namespace = "spconv"

        # The constant-memory mode needs the parameter symbol to be exported
        # by name from the compiled module.
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
            custom_names = [
                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
            ]
        else:
            custom_names = []

        # Dynamic parallelism requires linking against cudadevrt.
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            cudadevrt_p = get_cudadevrt_path()
            assert cudadevrt_p is not None, "DynamicParallism must have cudadevrt"
            cudadevrt = str(cudadevrt_p)
        else:
            cudadevrt = ""

        module = CummNVRTCModule([kernel],
                                 cudadevrt_path=cudadevrt,
                                 custom_names=custom_names)
        module.load()
        return module, kernel

    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp,
                                arch: Tuple[int, int],
                                stream_int: int) -> NVRTCParams:
        """Return NVRTC params for ``desp``, compiling only on a cache miss.

        Args:
            desp: GEMM algorithm descriptor to compile.
            arch: architecture tuple; used only as part of the cache key here.
            stream_int: stream identifier; used only as part of the cache key
                here.
        """
        cache_key = (str(desp), arch, stream_int)
        try:
            return self._nvrtc_caches[cache_key]
        except KeyError:
            pass
        module, kernel = self._compile_nvrtc_module(desp)
        result = _get_nvrtc_params(module, kernel, "gemm_kernel")
        self._nvrtc_caches[cache_key] = result
        return result
class ConvTunerSimple(ConvTunerSimpleBase):
    """``ConvTunerSimpleBase`` subclass that can build conv kernels with NVRTC.

    Compiled kernels are wrapped into NVRTC parameter objects and memoised per
    ``(str(desp), arch, stream_int)`` so each combination is compiled once.
    """

    def __init__(self, desps: List[ConvAlgoDesp]) -> None:
        """Forward ``desps`` to the base tuner and start with an empty cache."""
        super().__init__(desps)
        # key: (descriptor string, arch tuple, stream id) -> NVRTC params
        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int],
                                 NVRTCParams] = {}

    def _compile_nvrtc_module(self, desp: ConvAlgoDesp):
        """Generate the kernel for ``desp`` and load it as an NVRTC module.

        Returns:
            ``(module, kernel)``: the loaded ``CummNVRTCModule`` and the
            generated kernel object it was built from.
        """
        params = algocore.get_conv_param_from_desp(desp)
        kernel = gen_conv_kernels(params, SPCONV_NVRTC_MODE)
        kernel.namespace = "spconv"
        custom_names = []
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
            # The constant-memory mode needs the parameter symbol to be
            # exported by name from the compiled module.
            custom_names = [
                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
            ]
        cudadevrt = ""
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            # Dynamic parallelism requires linking against cudadevrt.
            cudadevrt_p = get_cudadevrt_path()
            assert cudadevrt_p is not None, "DynamicParallism must have cudadevrt"
            cudadevrt = str(cudadevrt_p)
        mod = CummNVRTCModule([kernel],
                              cudadevrt_path=cudadevrt,
                              verbose=False,
                              custom_names=custom_names)
        mod.load()
        return mod, kernel

    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp,
                                arch: Tuple[int, int],
                                stream_int: int) -> NVRTCParams:
        """Return NVRTC params for ``desp``, compiling only on a cache miss.

        Args:
            desp: conv algorithm descriptor to compile.
            arch: architecture tuple; used only as part of the cache key here.
            stream_int: stream identifier; used only as part of the cache key
                here.
        """
        key = (str(desp), arch, stream_int)
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
        # Announce the compile *before* starting it, so the notice is visible
        # while the compile runs (same order as
        # SimpleConv._cached_get_nvrtc_params).
        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
        mod, ker = self._compile_nvrtc_module(desp)
        nvrtc_params = _get_nvrtc_params(mod, ker, "conv_kernel")
        self._nvrtc_caches[key] = nvrtc_params
        return nvrtc_params
class SimpleGemm:
def __init__(self, prebuilt_desps: List[GemmAlgoDesp]) -> None:
all_desps = [algocore.get_conv_algo_desp_from_param(p) for p in ALL_IMPGEMM_PARAMS]
all_desps = [
algocore.get_gemm_algo_desp_from_param(p)
for p in ALL_NATIVE_PARAMS
]
self.prebuilt_desps = prebuilt_desps
self.prebuilt_desp_names = {str(d) for d in prebuilt_desps}
if SPCONV_DEBUG_NVRTC_KERNELS:
......@@ -178,7 +267,9 @@ class SimpleGemm:
kernel.namespace = "spconv"
custom_names = []
if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
custom_names = [f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"]
custom_names = [
f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
]
cudadevrt = ""
if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
cudadevrt_p = get_cudadevrt_path()
......@@ -186,12 +277,12 @@ class SimpleGemm:
cudadevrt = str(cudadevrt_p)
mod = CummNVRTCModule([kernel],
cudadevrt_path=cudadevrt,
verbose=False,
custom_names=custom_names)
mod.load()
return mod, kernel
def _cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int]):
def _cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int,
int]):
key = (str(desp), arch)
if key in self._nvrtc_caches:
return self._nvrtc_caches[key]
......@@ -218,12 +309,15 @@ class SimpleGemm:
trans_c = False
avail_algos = get_available_algo_str_from_arch(arch)
finally_algos: List[GemmAlgoDesp] = []
# print(self.static_key_to_desps)
for algo in avail_algos:
static_key = (trans_a, trans_b, trans_c, a.dtype, b.dtype, c.dtype,
shuffle_type.value, algo)
# print(static_key)
desps = self.static_key_to_desps.get(static_key, None)
if desps is None or len(desps) == 0:
continue
# print(desps)
for desp in desps:
# skip volta tensor op since it is very slow in architectures except volta.
if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
......@@ -430,6 +524,7 @@ class SimpleGemm:
best_scatter_params = (-1, -1, -1, -1)
all_profile_res: List[BestAlgoByProfile] = []
# print(avail)
for desp in avail:
c_.zero_whole_storage_()
split_k_slices = 1
......@@ -466,7 +561,8 @@ class SimpleGemm:
times.append(np.mean(this_times[1:]))
spk_speeds.append(times[-1])
all_profile_res.append(BestAlgoByProfile(desp, arch, splitk=spk))
all_profile_res.append(
BestAlgoByProfile(desp, arch, splitk=spk))
min_time = 1000
min_idx = -1
......@@ -490,8 +586,7 @@ class SimpleGemm:
return res, min_time
def run_with_tuned_result(
self,
def run_with_tuned_result(self,
profile_res: BestAlgoByProfile,
a: tv.Tensor,
b: tv.Tensor,
......@@ -501,7 +596,7 @@ class SimpleGemm:
trans_c: bool,
arch: Tuple[int, int],
stream: int,
shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
shuffle_type: ShuffleStrideType,
a_inds: tv.Tensor = tv.Tensor(),
b_inds: tv.Tensor = tv.Tensor(),
c_inds: tv.Tensor = tv.Tensor(),
......@@ -510,7 +605,8 @@ class SimpleGemm:
beta: float = 0.0,
gather_data: tv.Tensor = tv.Tensor(),
workspace: tv.Tensor = tv.Tensor(),
timer: CUDAKernelTimer = CUDAKernelTimer(False)):
timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False):
m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a,
trans_b, trans_c,
shuffle_type.value,
......@@ -526,8 +622,10 @@ class SimpleGemm:
if profile_res.splitk > 1:
split_k_slices = profile_res.splitk
params = GemmParams()
if algo_desp.is_nvrtc and str(algo_desp) not in self.prebuilt_desp_names:
params.nvrtc_params = self._cached_get_nvrtc_params(algo_desp, profile_res.arch)
is_not_static = str(algo_desp) not in self.prebuilt_desp_names
if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
params.nvrtc_params = self._cached_get_nvrtc_params(
algo_desp, profile_res.arch)
params.a = a
params.b = b
......@@ -569,8 +667,12 @@ _CONV_STATIC_KEY = Tuple[int, int, int, int, int, int, int, int, int, str, int]
class SimpleConv:
def __init__(self, prebuilt_desps: List[ConvAlgoDesp]) -> None:
all_desps = [algocore.get_conv_algo_desp_from_param(p) for p in ALL_IMPGEMM_PARAMS]
all_desps = [
algocore.get_conv_algo_desp_from_param(p)
for p in ALL_IMPGEMM_PARAMS
]
self.prebuilt_desps = prebuilt_desps
self.prebuilt_desp_names = {str(d) for d in prebuilt_desps}
self.prebuilt_desp_names.clear()
......@@ -650,6 +752,7 @@ class SimpleConv:
use_f32_as_accum = weight.dim(0) * kv > 128 * 27
else:
use_f32_as_accum = fp32_accum
use_f32_as_accum = False
for algo in avail_algos:
static_key = (layout_i.layout_type.value,
layout_w.layout_type.value,
......@@ -664,7 +767,6 @@ class SimpleConv:
if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
continue
if arch >= (7, 0) and is_fp16:
# skip simt fp16 kernels if we have tensor core
if desp.algo == GemmAlgo.Simt:
continue
if use_f32_as_accum:
......@@ -675,6 +777,7 @@ class SimpleConv:
ldw = weight.dim(-1)
ldo = out.dim(-1)
mask_width_valid = True
if desp.op_type == ConvOpType.kBackwardWeight.value:
assert mask_width > 0
mask_width_valid = mask_width % desp.tile_shape[2] == 0
......@@ -722,7 +825,9 @@ class SimpleConv:
kernel.namespace = "spconv"
custom_names = []
if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
custom_names = [f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"]
custom_names = [
f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
]
cudadevrt = ""
if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
cudadevrt_p = get_cudadevrt_path()
......@@ -735,10 +840,12 @@ class SimpleConv:
mod.load()
return mod, kernel
def _cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int]):
def _cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int,
int]):
key = (str(desp), arch)
if key in self._nvrtc_caches:
return self._nvrtc_caches[key]
print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
mod, ker = self._compile_nvrtc_module(desp)
nvrtc_params = _get_nvrtc_params(mod, ker, "conv_kernel")
self._nvrtc_caches[key] = nvrtc_params
......@@ -795,8 +902,8 @@ class SimpleConv:
params.indices = indices
params.mask = mask
params.mask_output = mask_output
if op_type == ConvOpType.kBackwardWeight:
assert not mask_output.empty()
# if op_type == ConvOpType.kBackwardWeight:
# assert not mask_output.empty()
if op_type == ConvOpType.kBackwardInput:
params.reverse_mask = reverse_mask
params.mask_filter = mask_filter
......@@ -808,20 +915,20 @@ class SimpleConv:
spk_speeds = []
for spk in splitk_tests:
this_times = []
for j in range(3):
GemmMainUnitTest.stream_synchronize(stream)
t = time.time()
for j in range(4):
params.split_k_slices = spk
if desp.is_nvrtc and str(desp) not in self.prebuilt_desp_names:
with tv.measure_duration(stream=stream) as measure:
if desp.is_nvrtc and str(
desp) not in self.prebuilt_desp_names:
tv.gemm.run_nvrtc_conv_kernel(params)
else:
ConvMainUnitTest.implicit_gemm2(params)
GemmMainUnitTest.stream_synchronize(stream)
this_times.append(time.time() - t)
this_times.append(measure.duration)
times.append(np.mean(this_times[1:]))
spk_speeds.append(times[-1])
all_profile_res.append(BestConvAlgoByProfile(desp, arch, splitk=spk))
all_profile_res.append(
BestConvAlgoByProfile(desp, arch, splitk=spk))
if not all_profile_res:
raise ValueError("can't find suitable algorithm for", op_type)
min_time = 1000
......@@ -865,7 +972,8 @@ class SimpleConv:
stream: int = 0,
workspace: tv.Tensor = tv.Tensor(),
verbose: bool = False,
timer: CUDAKernelTimer = CUDAKernelTimer(False)):
timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False):
channel_k = output.dim(1)
channel_c = inp.dim(1)
# GemmMainUnitTest.stream_synchronize(stream)
......@@ -879,13 +987,17 @@ class SimpleConv:
else:
op_type_value = op_type.value
params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value))
if algo_desp.is_nvrtc and str(algo_desp) not in self.prebuilt_desp_names:
params.nvrtc_params = self._cached_get_nvrtc_params(algo_desp, profile_res.arch)
is_not_static = str(
algo_desp) not in self.prebuilt_desp_names
if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
params.nvrtc_params = self._cached_get_nvrtc_params(
algo_desp, profile_res.arch)
params.conv_algo_desp = profile_res.algo_desp
params.input = inp
params.verbose = verbose
params.weight = weight.view([channel_k, -1, channel_c])
params.output = output
params.split_k_slices = split_k_slices
params.alpha = alpha
params.beta = beta
......@@ -893,6 +1005,7 @@ class SimpleConv:
params.mask_argsort = mask_argsort
params.indices = indices
params.mask = mask
params.mask_filter = mask_filter
params.mask_width = mask_width
params.mask_filter = mask_filter
......@@ -919,6 +1032,13 @@ class SimpleConv:
GEMM = SimpleGemm(ALL_ALGO_DESPS)
CONV = SimpleConv(ALL_CONV_ALGO_DESPS)
# Module-level tuner instances, built once at import time from every native
# GEMM parameter set and every implicit-GEMM conv parameter set.
GEMM_CPP = GemmTunerSimple(
    list(map(algocore.get_gemm_algo_desp_from_param, ALL_NATIVE_PARAMS)))
CONV_CPP = ConvTunerSimple(
    list(map(algocore.get_conv_algo_desp_from_param, ALL_IMPGEMM_PARAMS)))
if __name__ == "__main__":
print(len(ALL_CONV_ALGO_DESPS))
print(ALL_CONV_ALGO_DESPS[0])
......@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
from cumm.tensorview.gemm import ShuffleStrideType as ShuffleStrideTypeCpp
from cumm.tensorview.gemm import ConvParams, GemmAlgoDesp, GemmParams
from cumm.gemm.main import GemmAlgoParams
from cumm.conv.main import ConvAlgoParams, ConvIterAlgo
from cumm.gemm.main import GemmAlgoParams, gen_gemm_kernels
from cumm.conv.main import ConvAlgoParams, ConvIterAlgo, gen_gemm_kernels as gen_conv_kernels
from cumm import dtypes
from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
ConvLayoutType, ConvMode, ConvOpType)
......@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
desp.access_per_vector = p.access_per_vector
desp.is_nvrtc = p.is_nvrtc
def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
    """Build a ``GemmAlgoDesp`` that mirrors the GEMM parameter set ``p``.

    The shared properties are copied by ``_assign_gemm_desp_props``; the
    element-per-access fields are read from a kernel generated from ``p``.

    Args:
        p: GEMM algorithm parameters to describe.

    Returns:
        The populated ``GemmAlgoDesp``.
    """
    result = GemmAlgoDesp()
    _assign_gemm_desp_props(result, p)
    # The element-per-access values live on the generated kernel's iterators,
    # so a kernel has to be generated just to read them.
    generated = gen_gemm_kernels(p)
    result.element_per_access_a = (
        generated.input_spec.input_iter_a.element_per_acc)
    result.element_per_access_b = (
        generated.input_spec.input_iter_b.element_per_acc)
    result.element_per_access_c = (
        generated.output_spec.out_iter.element_per_acc)
    return result
......@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
desp.interleave_o = p.layout_desp_output.interleave
desp.mask_sparse = p.mask_sparse
desp.increment_k_first = p.increment_k_first
ker = gen_conv_kernels(p)
desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
return desp
......@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
p.is_nvrtc = desp.is_nvrtc
def get_gemm_param_from_desp(desp: GemmAlgoDesp):
p = GemmAlgoParams((0, 0, 0), (0, 0, 0), 0, "s8,s8,s8,s8,s8", False, False,
False, GemmAlgo.Simt)
......
"""Benchmark MinkowskiEngine
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_me_basic(dtype_str: str):
    """Resolve the torch dtype for the MinkowskiEngine benchmark.

    The visible body only maps ``dtype_str`` to a torch dtype; nothing is
    returned and no benchmark is run yet.

    Args:
        dtype_str: dtype shortcut understood by
            ``dtypes.get_dtype_by_shortcut``.

    Raises:
        NotImplementedError: if the resolved dtype has no entry in
            ``_DTYPE_TO_TORCH_DTYPE``.
    """
    resolved = dtypes.get_dtype_by_shortcut(dtype_str)
    is_supported = resolved in _DTYPE_TO_TORCH_DTYPE
    if not is_supported:
        raise NotImplementedError("only support bench f32 and f16 for now")
    torch_dtype = _DTYPE_TO_TORCH_DTYPE[resolved]
"""Benchmark torchsparse
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_torchsparse_basic(dtype_str: str):
    """Resolve the torch dtype for the torchsparse benchmark.

    The visible body only maps ``dtype_str`` to a torch dtype; nothing is
    returned and no benchmark is run yet.

    Args:
        dtype_str: dtype shortcut understood by
            ``dtypes.get_dtype_by_shortcut``.

    Raises:
        NotImplementedError: if the resolved dtype has no entry in
            ``_DTYPE_TO_TORCH_DTYPE``.
    """
    resolved = dtypes.get_dtype_by_shortcut(dtype_str)
    is_supported = resolved in _DTYPE_TO_TORCH_DTYPE
    if not is_supported:
        raise NotImplementedError("only support bench f32 and f16 for now")
    torch_dtype = _DTYPE_TO_TORCH_DTYPE[resolved]
......@@ -13,6 +13,7 @@
# limitations under the License.
from pathlib import Path
from typing import List
import pccm
from pccm.utils import project_is_editable, project_is_installed
......@@ -32,6 +33,10 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
cu = GemmMainUnitTest(all_shuffle)
......@@ -41,8 +46,35 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
convcu = ConvMainUnitTest(all_imp)
convcu.namespace = "cumm.conv.main"
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), ExternalAllocator()],
gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
cus = [
cu, convcu, gemmtuner, convtuner,
convops,
SpconvOps(),
BoxOps(),
HashTable(),
CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(),
]
pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT,
load_library=False)
load_library=False,
verbose=True)
# cus_dev: List[pccm.Class] = [
# ]
# pccm.builder.build_pybind(cus_dev,
# PACKAGE_ROOT / "core_cc_dev",
# namespace_root=PACKAGE_ROOT,
# load_library=False,
# verbose=True)
......@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")
DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
NDIM_DONT_CARE = 3
FILTER_HWIO = False
......@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
SPCONV_NVRTC_MODE = NVRTCMode.ConstantMemory
SPCONV_DEBUG_NVRTC_KERNELS = False
SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
class SpconvAllocatorKeys:
class AllocKeys:
Pair = "Pair"
IndiceNumPerLoc = "IndiceNumPerLoc"
PairMask = "PairMask"
......@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
# MaskArgSortFwd = "MaskArgSortFwd"
MaskArgSortBwd = "MaskArgSortBwd"
MaskOutputFwd = "MaskOutputFwd"
OutFeatures = "OutFeatures"
Features = "Features"
Filters = "Filters"
OutBp = "OutBp"
DIn = "DIn"
DFilters = "DFilters"
InpBuffer = "InpBuffer"
OutBuffer = "OutBuffer"
IndicePairsUniq = "IndicePairsUniq"
IndicePairsUniqBackup = "IndicePairsUniqBackup"
HashKOrKV = "HashKOrKV"
HashV = "HashV"
ThrustTemp = "ThrustTemp"
SPCONV_DEBUG_WEIGHT = False
SPCONV_CPP_INDICE_PAIRS = True
SPCONV_CPP_INDICE_PAIRS_IGEMM = True
SPCONV_CPP_GEMM = True
\ No newline at end of file
......@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
from cumm.gemm import kernel
from typing import List
from cumm.gemm.algospec.core import TensorOp
from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvIterAlgo, GemmAlgo
from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvFwd, ConvIterAlgo, GemmAlgo
from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
ConvLayoutType, ConvMode, ConvOpType)
from spconv.algocore import get_gemm_algo_desp_from_param
from spconv.constants import NDIM_DONT_CARE
......@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
increment_k_first=True,
access_per_vector=1),
]
IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Simt,
None,
mask_sparse=True,
increment_k_first=True,
access_per_vector=1),
*gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Simt,
None,
mask_sparse=True,
increment_k_first=True,
access_per_vector=1),
]
IMPLGEMM_VOLTA_PARAMS = [
......@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
# NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
# gen_conv_params(ConvFwdAndBwdInput, )
# all int8 kernels use nvrtc.
*gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 64, 64), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 64, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 32, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (128, 128, 64), (64, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
# *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
]
ALL_NATIVE_PARAMS = SHUFFLE_SIMT_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_VOLTA_PARAMS
......
......@@ -48,7 +48,7 @@ class SpconvOps:
"""
...
@staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int:
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0, use_bound_algo: bool = False) -> int:
"""
Args:
indices:
......@@ -58,6 +58,7 @@ class SpconvOps:
indice_pairs_uniq:
indice_pairs_uniq_before_sort:
out_inds:
indice_num_per_loc:
num_out_act:
batch_size:
output_dims:
......@@ -68,6 +69,7 @@ class SpconvOps:
dilation:
transposed:
stream_int:
use_bound_algo:
"""
...
@staticmethod
......@@ -191,6 +193,31 @@ class SpconvOps:
"""
...
@staticmethod
def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None:
"""
Args:
out_features:
features:
indice_pairs:
indice_pair_num:
num_activate_out:
stream:
"""
...
@staticmethod
def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None:
"""
Args:
din:
features:
out_features:
out_bp:
indice_pairs:
indice_pair_num:
stream:
"""
...
@staticmethod
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
"""
Args:
......@@ -369,7 +396,18 @@ class SpconvOps:
@staticmethod
def get_int32_max() -> int: ...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor:
def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int:
"""
Args:
kv:
num_act_in:
num_act_out_bound:
subm:
use_int64_hash_k:
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
"""
Args:
allocator:
......@@ -386,10 +424,11 @@ class SpconvOps:
transposed:
is_train:
stream_int:
num_out_act_bound:
"""
...
@staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) -> None:
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int:
"""
Args:
allocator:
......@@ -405,12 +444,6 @@ class SpconvOps:
subm:
transposed:
stream_int:
"""
...
@staticmethod
def test_allocator(allocator) -> None:
"""
Args:
allocator:
num_out_act_bound:
"""
...
......@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
"""
...
def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
"""
...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor:
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
"""
Args:
name:
......@@ -28,9 +32,11 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
"""
...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor:
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
"""
Args:
name:
......@@ -38,6 +44,14 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
"""
...
def get_tensor_by_name(self, name: str) -> Tensor:
    """Type stub; takes a string ``name`` and is declared to return a ``Tensor``.

    Args:
        name: string key of the tensor to look up.
    """
    ...
def free(self, ten: Tensor) -> None:
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from ...csrc.sparse.convops import ExternalSpconvMatmul
class GemmTuneResult:
    """Type stub declaring the fields and constructors of ``GemmTuneResult``.

    All bodies are elided; only the signatures are meaningful.
    """

    algo_desp: GemmAlgoDesp
    # Pair of ints (presumably a (major, minor) compute capability — confirm).
    arch: Tuple[int, int]
    splitk: int

    def is_valid(self) -> bool:
        ...

    @overload
    def __init__(self) -> None:
        ...

    @overload
    def __init__(
        self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int
    ) -> None:
        """Overload taking one value for each declared attribute.

        Args:
            algo_desp: a ``GemmAlgoDesp``.
            arch: pair of ints.
            splitk: integer.
        """
        ...
class ConvTuneResult:
    """Type stub declaring the fields and constructors of ``ConvTuneResult``.

    All bodies are elided; only the signatures are meaningful.
    """

    algo_desp: ConvAlgoDesp
    # Pair of ints (presumably a (major, minor) compute capability — confirm).
    arch: Tuple[int, int]
    splitk: int

    @overload
    def __init__(self) -> None:
        ...

    @overload
    def __init__(
        self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int
    ) -> None:
        """Overload taking one value for each declared attribute.

        Args:
            algo_desp: a ``ConvAlgoDesp``.
            arch: pair of ints.
            splitk: integer.
        """
        ...

    def is_valid(self) -> bool:
        ...
class ExternalSpconvMatmul:
    """Type stub for ``ExternalSpconvMatmul``; every method body is elided.

    Parameters whose names end in ``_n`` are annotated ``str`` (presumably
    tensor names resolved elsewhere — confirm against the implementation).
    """

    def indice_conv_init_gemm(
        self, features_n: str, filters_n: str, all_weight_is_krsc: bool,
        is_kc_not_ck: bool, kv_center: int, out_channel: int,
        stream_int: int = 0,
    ) -> Tensor:
        """Declared to return a ``Tensor``.

        Args:
            features_n: string.
            filters_n: string.
            all_weight_is_krsc: boolean flag.
            is_kc_not_ck: boolean flag.
            kv_center: integer.
            out_channel: integer.
            stream_int: integer, defaults to ``0``.
        """
        ...

    def indice_conv_cpu_gemm(
        self, inp_buffer_n: str, out_buffer_n: str, filters_n: str,
        all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int,
    ) -> None:
        """Declared to return ``None``.

        Args:
            inp_buffer_n: string.
            out_buffer_n: string.
            filters_n: string.
            all_weight_is_krsc: boolean flag.
            is_kc_not_ck: boolean flag.
            nhot: integer.
            index: integer.
        """
        ...

    def indice_conv_bwd_init_gemm(
        self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str,
        all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int,
        stream_int: int = 0,
    ) -> Tensor:
        """Declared to return a ``Tensor``.

        Args:
            features_n: string.
            filters_n: string.
            out_bp_n: string.
            dfilters_n: string.
            all_weight_is_krsc: boolean flag.
            is_kc_not_ck: boolean flag.
            kv_center: integer.
            stream_int: integer, defaults to ``0``.
        """
        ...

    def indice_conv_bwd_cpu_gemm(
        self, inp_buffer_n: str, out_buffer_n: str, filters_n: str,
        dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool,
        nhot: int, index: int,
    ) -> None:
        """Declared to return ``None``.

        Args:
            inp_buffer_n: string.
            out_buffer_n: string.
            filters_n: string.
            dfilters_n: string.
            all_weight_is_krsc: boolean flag.
            is_kc_not_ck: boolean flag.
            nhot: integer.
            index: integer.
        """
        ...
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
    """Type stub for a subclass of ``ExternalSpconvMatmul`` built from ``alloc``."""

    def __init__(self, alloc) -> None:
        """Constructor stub (body elided).

        Args:
            alloc: untyped in the stub (presumably an allocator object —
                confirm against the implementation).
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
class ConvTunerSimple:
    """Type stub for ``ConvTunerSimple``; every method body is elided.

    ``arch`` parameters are pairs of ints (presumably a (major, minor)
    compute capability — confirm against the implementation).
    """

    def __init__(self, desps: List[ConvAlgoDesp]) -> None:
        """Constructor stub.

        Args:
            desps: list of ``ConvAlgoDesp``.
        """
        ...

    @staticmethod
    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
        """Declared to return a list of strings for the given ``arch``.

        Args:
            arch: pair of ints.
        """
        ...

    def get_all_available(
        self, inp: Tensor, weight: Tensor, out: Tensor,
        layout_i: int, layout_w: int, layout_o: int,
        interleave_i: int, interleave_w: int, interleave_o: int,
        arch: Tuple[int, int], op_type: int, mask_width: int,
        auto_fp32_accum: bool, fp32_accum: bool,
    ) -> List[ConvAlgoDesp]:
        """Declared to return a list of ``ConvAlgoDesp``.

        Args:
            inp: tensor argument.
            weight: tensor argument.
            out: tensor argument.
            layout_i: integer.
            layout_w: integer.
            layout_o: integer.
            interleave_i: integer.
            interleave_w: integer.
            interleave_o: integer.
            arch: pair of ints.
            op_type: integer.
            mask_width: integer.
            auto_fp32_accum: boolean flag.
            fp32_accum: boolean flag.
        """
        ...

    def cached_get_nvrtc_params(
        self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int
    ) -> NVRTCParams:
        """Declared to return ``NVRTCParams`` for ``desp`` on ``arch``.

        Args:
            desp: a ``ConvAlgoDesp``.
            arch: pair of ints.
            stream_int: integer.
        """
        ...

    def tune_and_cache(
        self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor,
        layout_i: int, layout_w: int, layout_o: int,
        interleave_i: int, interleave_w: int, interleave_o: int,
        arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor,
        indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff,
        mask_width: int = -1, mask_output: Tensor = Tensor(),
        alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0,
        auto_fp32_accum: bool = True, fp32_accum: bool = False,
        num_run: int = 5,
    ) -> Tuple[ConvTuneResult, float]:
        """Declared to return a ``(ConvTuneResult, float)`` pair.

        Args:
            op_type: integer.
            inp: tensor argument.
            weight: tensor argument.
            output: tensor argument.
            layout_i: integer.
            layout_w: integer.
            layout_o: integer.
            interleave_i: integer.
            interleave_w: integer.
            interleave_o: integer.
            arch: pair of ints.
            mask: tensor argument.
            mask_argsort: tensor argument.
            indices: tensor argument.
            reverse_mask: boolean flag.
            mask_filter: integer, defaults to ``0xffffffff``.
            mask_width: integer, defaults to ``-1``.
            mask_output: tensor, defaults to a default-constructed ``Tensor``.
            alpha: float, defaults to ``1.0``.
            beta: float, defaults to ``0.0``.
            stream_int: integer, defaults to ``0``.
            auto_fp32_accum: boolean, defaults to ``True``.
            fp32_accum: boolean, defaults to ``False``.
            num_run: integer, defaults to ``5``.
        """
        ...

    def get_tuned_algo(
        self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int,
        k: int, c: int, arch: Tuple[int, int], mask_width: int = -1,
    ) -> Tuple[Any, bool]:
        """Declared to return an ``(Any, bool)`` pair.

        Args:
            op_type: integer.
            i_dtype: integer.
            w_dtype: integer.
            o_dtype: integer.
            k: integer.
            c: integer.
            arch: pair of ints.
            mask_width: integer, defaults to ``-1``.
        """
        ...

    def run_with_tuned_result(
        self, profile_res, op_type: int, inp: Tensor, weight: Tensor,
        output: Tensor, mask: Tensor, mask_argsort: Tensor,
        mask_output: Tensor, indices: Tensor, reverse_mask: bool,
        mask_filter: int = 0xffffffff, mask_width: int = -1,
        alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0,
        workspace: Tensor = Tensor(), verbose: bool = False,
        timer: CUDAKernelTimer = CUDAKernelTimer(False),
        force_nvrtc: bool = False,
    ) -> None:
        """Declared to return ``None``.

        The ``timer`` default is spelled with Python's ``False``; the C++
        literal ``false`` is not a Python name.

        Args:
            profile_res: untyped in the stub.
            op_type: integer.
            inp: tensor argument.
            weight: tensor argument.
            output: tensor argument.
            mask: tensor argument.
            mask_argsort: tensor argument.
            mask_output: tensor argument.
            indices: tensor argument.
            reverse_mask: boolean flag.
            mask_filter: integer, defaults to ``0xffffffff``.
            mask_width: integer, defaults to ``-1``.
            alpha: float, defaults to ``1.0``.
            beta: float, defaults to ``0.0``.
            stream_int: integer, defaults to ``0``.
            workspace: tensor, defaults to a default-constructed ``Tensor``.
            verbose: boolean, defaults to ``False``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            force_nvrtc: boolean, defaults to ``False``.
        """
        ...

    def query_workspace_size(
        self, desp: ConvAlgoDesp, splitk: int, op_type: int,
        N: int, C: int, K: int, kv: int,
    ) -> int:
        """Declared to return an ``int``.

        Args:
            desp: a ``ConvAlgoDesp``.
            splitk: integer.
            op_type: integer.
            N: integer.
            C: integer.
            K: integer.
            kv: integer.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
class GemmTunerSimple:
    """Type stub for ``GemmTunerSimple``; every method body is elided.

    ``arch`` parameters are pairs of ints (presumably a (major, minor)
    compute capability — confirm against the implementation).
    """

    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
        """Constructor stub.

        Args:
            desps: list of ``GemmAlgoDesp``.
        """
        ...

    @staticmethod
    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
        """Declared to return a list of strings for the given ``arch``.

        Args:
            arch: pair of ints.
        """
        ...

    def get_all_available(
        self, a: Tensor, b: Tensor, c: Tensor,
        trans_a: bool, trans_b: bool, trans_c: bool,
        arch: Tuple[int, int], shuffle_type: int,
    ) -> List[GemmAlgoDesp]:
        """Declared to return a list of ``GemmAlgoDesp``.

        Args:
            a: tensor argument.
            b: tensor argument.
            c: tensor argument.
            trans_a: boolean flag.
            trans_b: boolean flag.
            trans_c: boolean flag.
            arch: pair of ints.
            shuffle_type: integer.
        """
        ...

    def cached_get_nvrtc_params(
        self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int
    ) -> NVRTCParams:
        """Declared to return ``NVRTCParams`` for ``desp`` on ``arch``.

        Args:
            desp: a ``GemmAlgoDesp``.
            arch: pair of ints.
            stream_int: integer.
        """
        ...

    def tune_and_cache(
        self, a: Tensor, b: Tensor, c: Tensor,
        trans_a: bool, trans_b: bool, trans_c: bool,
        arch: Tuple[int, int], shuffle_type: int,
        a_inds: Tensor, b_inds: Tensor, c_inds: Tensor,
        hint: int = 0, alpha: float = 1.0, beta: float = 0.0,
        stream_int: int = 0, num_run: int = 5,
    ) -> Tuple[GemmTuneResult, float]:
        """Declared to return a ``(GemmTuneResult, float)`` pair.

        Args:
            a: tensor argument.
            b: tensor argument.
            c: tensor argument.
            trans_a: boolean flag.
            trans_b: boolean flag.
            trans_c: boolean flag.
            arch: pair of ints.
            shuffle_type: integer.
            a_inds: tensor argument.
            b_inds: tensor argument.
            c_inds: tensor argument.
            hint: integer, defaults to ``0``.
            alpha: float, defaults to ``1.0``.
            beta: float, defaults to ``0.0``.
            stream_int: integer, defaults to ``0``.
            num_run: integer, defaults to ``5``.
        """
        ...

    def get_tuned_algo(
        self, a_dtype: int, b_dtype: int, c_dtype: int,
        a_shape: List[int], b_shape: List[int], c_shape: List[int],
        trans_a: bool, trans_b: bool, trans_c: bool,
        arch: Tuple[int, int], shuffle_type: int,
        a_inds_shape: List[int], b_inds_shape: List[int],
        c_inds_shape: List[int], hint: int = 0,
    ) -> Tuple[Any, bool]:
        """Declared to return an ``(Any, bool)`` pair.

        Args:
            a_dtype: integer.
            b_dtype: integer.
            c_dtype: integer.
            a_shape: list of ints.
            b_shape: list of ints.
            c_shape: list of ints.
            trans_a: boolean flag.
            trans_b: boolean flag.
            trans_c: boolean flag.
            arch: pair of ints.
            shuffle_type: integer.
            a_inds_shape: list of ints.
            b_inds_shape: list of ints.
            c_inds_shape: list of ints.
            hint: integer, defaults to ``0``.
        """
        ...

    def run_with_tuned_result(
        self, profile_res, a: Tensor, b: Tensor, c: Tensor,
        trans_a: bool, trans_b: bool, trans_c: bool,
        arch: Tuple[int, int], stream_int: int, shuffle_type: int,
        a_inds: Tensor, b_inds: Tensor, c_inds: Tensor,
        hint: int = 0, alpha: float = 1.0, beta: float = 0.0,
        workspace: Tensor = Tensor(),
        timer: CUDAKernelTimer = CUDAKernelTimer(False),
        force_nvrtc: bool = False,
    ) -> None:
        """Declared to return ``None``.

        Args:
            profile_res: untyped in the stub.
            a: tensor argument.
            b: tensor argument.
            c: tensor argument.
            trans_a: boolean flag.
            trans_b: boolean flag.
            trans_c: boolean flag.
            arch: pair of ints.
            stream_int: integer (no default in this signature).
            shuffle_type: integer.
            a_inds: tensor argument.
            b_inds: tensor argument.
            c_inds: tensor argument.
            hint: integer, defaults to ``0``.
            alpha: float, defaults to ``1.0``.
            beta: float, defaults to ``0.0``.
            workspace: tensor, defaults to a default-constructed ``Tensor``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            force_nvrtc: boolean, defaults to ``False``.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
    """Type stub for ``ConvGemmOps``; all members are static and bodies are elided."""

    @staticmethod
    def get_compute_capability(index: int = -1) -> Tuple[int, int]:
        """Declared to return a pair of ints.

        Args:
            index: integer, defaults to ``-1``.
        """
        ...

    @staticmethod
    def indice_conv(
        allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool,
        features: Tensor, filters: Tensor, indice_pairs: Tensor,
        indice_pair_num: Tensor, num_activate_out: int,
        inverse: bool = False, subm: bool = False, algo: int = 0,
        stream_int: int = 0,
    ) -> None:
        """Declared to return ``None``.

        1. This function needs to take an out features that comes from the
           subm first mm.
        2. This function doesn't support CPU.

        Args:
            allocator: untyped in the stub.
            ext_mm: untyped in the stub.
            gemm_tuner: untyped in the stub.
            all_w_is_krsc: boolean flag.
            filter_hwio: boolean flag.
            features: tensor argument.
            filters: tensor argument.
            indice_pairs: tensor argument.
            indice_pair_num: tensor argument.
            num_activate_out: integer.
            inverse: boolean, defaults to ``False``.
            subm: boolean, defaults to ``False``.
            algo: integer, defaults to ``0``.
            stream_int: integer, defaults to ``0``.
        """
        ...

    @staticmethod
    def indice_conv_backward(
        allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool,
        features: Tensor, filters: Tensor, out_bp: Tensor,
        indice_pairs: Tensor, indice_pair_num: Tensor,
        inverse: bool = False, subm: bool = False, algo: int = 0,
        stream_int: int = 0,
    ) -> None:
        """Declared to return ``None``.

        Args:
            allocator: untyped in the stub.
            ext_mm: untyped in the stub.
            gemm_tuner: untyped in the stub.
            all_w_is_krsc: boolean flag.
            filter_hwio: boolean flag.
            features: tensor argument.
            filters: tensor argument.
            out_bp: tensor argument.
            indice_pairs: tensor argument.
            indice_pair_num: tensor argument.
            inverse: boolean, defaults to ``False``.
            subm: boolean, defaults to ``False``.
            algo: integer, defaults to ``0``.
            stream_int: integer, defaults to ``0``.
        """
        ...

    @staticmethod
    def implicit_gemm(
        allocator, conv_tuner, features: Tensor, filters: Tensor,
        pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor],
        mask_argsort_fwd_splits: List[Tensor], num_activate_out: int,
        masks: Tensor, is_train: bool = False, is_subm: bool = False,
        stream_int: int = 0,
        timer: CUDAKernelTimer = CUDAKernelTimer(False),
        auto_fp32_accum: bool = True, fp32_accum: bool = False,
    ) -> int:
        """Declared to return an ``int``.

        Args:
            allocator: untyped in the stub.
            conv_tuner: untyped in the stub.
            features: tensor argument.
            filters: tensor argument.
            pair_fwd: tensor argument.
            pair_mask_fwd_splits: list of tensors.
            mask_argsort_fwd_splits: list of tensors.
            num_activate_out: integer.
            masks: tensor argument.
            is_train: boolean, defaults to ``False``.
            is_subm: boolean, defaults to ``False``.
            stream_int: integer, defaults to ``0``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            auto_fp32_accum: boolean, defaults to ``True``.
            fp32_accum: boolean, defaults to ``False``.
        """
        ...

    @staticmethod
    def implicit_gemm_backward(
        allocator, conv_tuner, features: Tensor, filters: Tensor,
        out_bp: Tensor, pair_fwd: Tensor, pair_bwd: Tensor,
        pair_mask_fwd_splits: List[Tensor],
        pair_mask_bwd_splits: List[Tensor],
        mask_argsort_fwd_splits: List[Tensor],
        mask_argsort_bwd_splits: List[Tensor],
        mask_output_fwd: Tensor, masks: Tensor, mask_width: int,
        is_subm: bool, stream_int: int = 0,
        timer: CUDAKernelTimer = CUDAKernelTimer(False),
        auto_fp32_accum: bool = True, fp32_accum: bool = False,
    ) -> None:
        """Declared to return ``None``.

        Args:
            allocator: untyped in the stub.
            conv_tuner: untyped in the stub.
            features: tensor argument.
            filters: tensor argument.
            out_bp: tensor argument.
            pair_fwd: tensor argument.
            pair_bwd: tensor argument.
            pair_mask_fwd_splits: list of tensors.
            pair_mask_bwd_splits: list of tensors.
            mask_argsort_fwd_splits: list of tensors.
            mask_argsort_bwd_splits: list of tensors.
            mask_output_fwd: tensor argument.
            masks: tensor argument.
            mask_width: integer.
            is_subm: boolean flag (no default in this signature).
            stream_int: integer, defaults to ``0``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            auto_fp32_accum: boolean, defaults to ``True``.
            fp32_accum: boolean, defaults to ``False``.
        """
        ...
......@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo:
    """Type stub for ``CompileInfo``; both members are static and bodies are elided."""

    @staticmethod
    def get_compiled_cuda_arch() -> List[Tuple[int, int]]:
        """Declared to return a list of int pairs."""
        ...

    @staticmethod
    def arch_is_compiled(arch: Tuple[int, int]) -> bool:
        """Declared to return a ``bool`` for the given ``arch``.

        Args:
            arch: pair of ints.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import GemmParams
class GemmMainUnitTest:
@staticmethod
def get_all_algo_desp() -> List[Any]: ...
def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
@staticmethod
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "0", a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: int = 0, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
"""
Args:
a_shape:
......
This diff is collapsed.
import pccm
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
from spconv.constants import AllocKeys
class ExternalAllocatorGuard(pccm.Class):
def __init__(self):
super().__init__()
......@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
......@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
......@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
......@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "float")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
@pccm.member_function(virtual=True, pure_virtual=True)
def get_tensor_by_name(self):
    """Declare the pure-virtual ``get_tensor_by_name(std::string name)``.

    No body is emitted; the declaration returns ``tv::Tensor``.
    """
    decl = pccm.code()
    decl.arg("name", "std::string")
    return decl.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
......@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
// "" means temp memory
auto ten = zeros("", shape, dtype, device);
auto ten = zeros(name, shape, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = empty("", shape, dtype, device);
auto ten = empty(name, shape, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = full_int("", shape, value, dtype, device);
auto ten = full_int(name, shape, value, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -150,8 +176,10 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = full_float("", shape, value, dtype, device);
auto ten = full_float(name, shape, value, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t);
}});
......@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
code.arg("num_bytes", "std::ptrdiff_t")
code.ret("char*")
code.raw(f"""
auto ten = allocator_.empty("", {{num_bytes}}, tv::uint8, 0);
auto ten = allocator_.empty({pccm.literal(AllocKeys.ThrustTemp)}, {{num_bytes}}, tv::uint8, 0);
return reinterpret_cast<char*>(ten.raw_data());
""")
return code
......@@ -193,3 +221,158 @@ class ThrustAllocator(pccm.Class):
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
""")
return code
class StaticAllocator(ExternalAllocator):
    """A simple allocator for the TensorRT plugin.

    Named requests are served from the tensors passed to the constructor
    (``tensor_dict_``). The only memory created on demand is the thrust
    scratch buffer, which ``empty`` allocates lazily and grows by
    ``self.grow`` when a larger request arrives.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        # name -> backing tensor, filled by the generated constructor.
        self.add_member("tensor_dict_",
                        "std::unordered_map<std::string, tv::Tensor>")
        # Listing of tensor_dict_ built once; used in lookup error messages.
        self.add_member("repr_", "std::string")
        # Scratch buffer returned for AllocKeys.ThrustTemp requests.
        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
        # Over-allocation factor applied when the thrust buffer must grow.
        self.grow = 1.5

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Constructor: store ``tensor_dict`` and pre-render its listing into ``repr_``."""
        code = pccm.code()
        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
        code.ctor_init("tensor_dict_", "tensor_dict")
        code.raw(f"""
        std::stringstream ss;
        for (auto& p : tensor_dict){{
            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
        }}
        repr_ = ss.str();
        """)
        return code

    @pccm.member_function(virtual=True)
    def _get_raw_and_check(self):
        """Look up ``name``, assert it is large enough and on ``device``,
        and return a view of its storage with the requested shape/dtype.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.raw(f"""
        auto res = get_tensor_by_name(name);
        // Seed with int64_t: std::accumulate accumulates in the type of the
        // initial value, so a plain 1 would multiply in int and can overflow.
        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
        // NOTE(review): nbytes() is compared against total * tv::bit_size(...).
        // If bit_size returns bits rather than bytes this check is 8x too
        // strict; confirm against the tensorview headers.
        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype))
            && res.device() == device, "alloc failed", shape, res.shape());
        return tv::from_blob(res.raw_data(), shape, dtype, device);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def zeros(self):
        """Return the named buffer viewed as ``shape``/``dtype``, zeroed on ``stream``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.zero_(tvctx);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def empty(self):
        """Return the named buffer uninitialised.

        Requests named ``AllocKeys.ThrustTemp`` bypass ``tensor_dict_`` and
        are served from a single lazily allocated, growable scratch tensor.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
            // we assume each allocator always handle one stream
            // so we can just use one tensor
            tv::Tensor res = thrust_tmp_tensor_;
            if (res.empty()){{
                res = tv::empty(shape, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            return res;
        }}else{{
            auto blob = _get_raw_and_check(name, shape, dtype, device);
            return blob;
        }}
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_int(self):
        """Return the named buffer filled with the integer ``value`` on ``stream``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        // Bind the caller's stream, as zeros() does; otherwise the fill
        // would ignore the stream argument.
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_float(self):
        """Return the named buffer filled with the float ``value`` on ``stream``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        // tvctx must be declared here: fill_ below takes it, and the
        // generated C++ would not compile without it.
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def get_tensor_by_name(self):
        """Return the tensor registered under ``name``; assert if it is missing."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.raw(f"""
        TV_ASSERT_RT_ERR(tensor_dict_.find(name) != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
        return tensor_dict_.at(name);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free(self):
        """Emit an empty ``free``: no statements are generated for it."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free_noexcept(self):
        """Emit an empty ``free_noexcept``: no statements are generated for it."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment