Commit 899008fa authored by yan.yan's avatar yan.yan
Browse files

working on c++ only

parent f78575ea
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How to develop spconv 2.x
## First step
spconv 2.x is written in a unique C++ framework, ```pccm```. Read the [pccm guide]() to learn how to use ```pccm```.
It's recommended to uninstall any spconv and cumm installed by pip, then install both spconv and cumm in editable mode (```pip install -e .```).
## Architecture
\ No newline at end of file
...@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1": ...@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
from spconv.csrc.utils import BoxOps from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable from spconv.csrc.hash.core import HashTable
from cumm.common import CompileInfo from cumm.common import CompileInfo
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS) cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS) convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
...@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1": ...@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14" std = "c++14"
else: else:
std = "c++17" std = "c++17"
cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
cus = [SpconvOps(), BoxOps(), HashTable(), CompileInfo()] gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
else:
gemmtuner = GemmTunerSimple(None)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(None)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
cus = [gemmtuner, convtuner,
convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul()]
if not CUMM_CPU_ONLY_BUILD:
cus.extend([cu, convcu])
ext_modules: List[Extension] = [ ext_modules: List[Extension] = [
PCCMExtension(cus, PCCMExtension(cus,
"spconv/core_cc", "spconv/core_cc",
Path(__file__).resolve().parent / "spconv", Path(__file__).resolve().parent / "spconv",
objects_folder="objects",
std=std, std=std,
disable_pch=True, disable_pch=True,
verbose=True) verbose=True)
......
This diff is collapsed.
...@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp ...@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
from cumm.tensorview.gemm import ShuffleStrideType as ShuffleStrideTypeCpp from cumm.tensorview.gemm import ShuffleStrideType as ShuffleStrideTypeCpp
from cumm.tensorview.gemm import ConvParams, GemmAlgoDesp, GemmParams from cumm.tensorview.gemm import ConvParams, GemmAlgoDesp, GemmParams
from cumm.gemm.main import GemmAlgoParams from cumm.gemm.main import GemmAlgoParams, gen_gemm_kernels
from cumm.conv.main import ConvAlgoParams, ConvIterAlgo from cumm.conv.main import ConvAlgoParams, ConvIterAlgo, gen_gemm_kernels as gen_conv_kernels
from cumm import dtypes from cumm import dtypes
from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout, from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
ConvLayoutType, ConvMode, ConvOpType) ConvLayoutType, ConvMode, ConvOpType)
...@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp], ...@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
desp.access_per_vector = p.access_per_vector desp.access_per_vector = p.access_per_vector
desp.is_nvrtc = p.is_nvrtc desp.is_nvrtc = p.is_nvrtc
def get_gemm_algo_desp_from_param(p: GemmAlgoParams): def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
desp = GemmAlgoDesp() desp = GemmAlgoDesp()
_assign_gemm_desp_props(desp, p) _assign_gemm_desp_props(desp, p)
# here we must generate kernel for element-per-access data
ker = gen_gemm_kernels(p)
desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
return desp return desp
...@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams): ...@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
desp.interleave_o = p.layout_desp_output.interleave desp.interleave_o = p.layout_desp_output.interleave
desp.mask_sparse = p.mask_sparse desp.mask_sparse = p.mask_sparse
desp.increment_k_first = p.increment_k_first desp.increment_k_first = p.increment_k_first
ker = gen_conv_kernels(p)
desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
return desp return desp
...@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp], ...@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
p.is_nvrtc = desp.is_nvrtc p.is_nvrtc = desp.is_nvrtc
def get_gemm_param_from_desp(desp: GemmAlgoDesp): def get_gemm_param_from_desp(desp: GemmAlgoDesp):
p = GemmAlgoParams((0, 0, 0), (0, 0, 0), 0, "s8,s8,s8,s8,s8", False, False, p = GemmAlgoParams((0, 0, 0), (0, 0, 0), 0, "s8,s8,s8,s8,s8", False, False,
False, GemmAlgo.Simt) False, GemmAlgo.Simt)
......
"""Benchmark MinkowskiEngine
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_me_basic(dtype_str: str):
    """Resolve the torch dtype used for the MinkowskiEngine benchmark.

    Args:
        dtype_str: dtype shortcut string understood by
            ``dtypes.get_dtype_by_shortcut``.

    Raises:
        NotImplementedError: if the resolved dtype has no entry in
            ``_DTYPE_TO_TORCH_DTYPE``.
    """
    requested = dtypes.get_dtype_by_shortcut(dtype_str)
    # The table only ever maps to real torch dtypes, so a ``None`` result
    # means the requested dtype is not a key of the table.
    torch_dtype = _DTYPE_TO_TORCH_DTYPE.get(requested)
    if torch_dtype is None:
        raise NotImplementedError("only support bench f32 and f16 for now")
"""Benchmark torchsparse
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_torchsparse_basic(dtype_str: str):
    """Resolve the torch dtype used for the torchsparse benchmark.

    Args:
        dtype_str: dtype shortcut string understood by
            ``dtypes.get_dtype_by_shortcut``.

    Raises:
        NotImplementedError: if the resolved dtype has no entry in
            ``_DTYPE_TO_TORCH_DTYPE``.
    """
    requested = dtypes.get_dtype_by_shortcut(dtype_str)
    # The table only ever maps to real torch dtypes, so a ``None`` result
    # means the requested dtype is not a key of the table.
    torch_dtype = _DTYPE_TO_TORCH_DTYPE.get(requested)
    if torch_dtype is None:
        raise NotImplementedError("only support bench f32 and f16 for now")
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from pathlib import Path from pathlib import Path
from typing import List
import pccm import pccm
from pccm.utils import project_is_editable, project_is_installed from pccm.utils import project_is_editable, project_is_installed
...@@ -32,17 +33,48 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -32,17 +33,48 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.alloc import ExternalAllocator from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.utils import BoxOps from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable from spconv.csrc.hash.core import HashTable
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle)) all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
cu = GemmMainUnitTest(all_shuffle) cu = GemmMainUnitTest(all_shuffle)
cu.namespace = "cumm.gemm.main" cu.namespace = "cumm.gemm.main"
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS) IMPLGEMM_TURING_PARAMS)
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp)) all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
convcu = ConvMainUnitTest(all_imp) convcu = ConvMainUnitTest(all_imp)
convcu.namespace = "cumm.conv.main" convcu.namespace = "cumm.conv.main"
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), ExternalAllocator()], gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
cus = [
cu, convcu, gemmtuner, convtuner,
convops,
SpconvOps(),
BoxOps(),
HashTable(),
CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(),
]
pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc", PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT, namespace_root=PACKAGE_ROOT,
load_library=False) load_library=False,
verbose=True)
# cus_dev: List[pccm.Class] = [
# ]
# pccm.builder.build_pybind(cus_dev,
# PACKAGE_ROOT / "core_cc_dev",
# namespace_root=PACKAGE_ROOT,
# load_library=False,
# verbose=True)
...@@ -30,6 +30,7 @@ if _filter_hwio_env is not None: ...@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.") raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")
DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1" DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
NDIM_DONT_CARE = 3 NDIM_DONT_CARE = 3
FILTER_HWIO = False FILTER_HWIO = False
...@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32, ...@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
SPCONV_NVRTC_MODE = NVRTCMode.ConstantMemory SPCONV_NVRTC_MODE = NVRTCMode.ConstantMemory
SPCONV_DEBUG_NVRTC_KERNELS = False SPCONV_DEBUG_NVRTC_KERNELS = False
SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
class SpconvAllocatorKeys: class AllocKeys:
Pair = "Pair" Pair = "Pair"
IndiceNumPerLoc = "IndiceNumPerLoc" IndiceNumPerLoc = "IndiceNumPerLoc"
PairMask = "PairMask" PairMask = "PairMask"
...@@ -72,5 +75,31 @@ class SpconvAllocatorKeys: ...@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
# MaskArgSortFwd = "MaskArgSortFwd" # MaskArgSortFwd = "MaskArgSortFwd"
MaskArgSortBwd = "MaskArgSortBwd" MaskArgSortBwd = "MaskArgSortBwd"
MaskOutputFwd = "MaskOutputFwd"
OutFeatures = "OutFeatures" OutFeatures = "OutFeatures"
Features = "Features"
Filters = "Filters"
OutBp = "OutBp"
DIn = "DIn"
DFilters = "DFilters"
InpBuffer = "InpBuffer"
OutBuffer = "OutBuffer"
IndicePairsUniq = "IndicePairsUniq"
IndicePairsUniqBackup = "IndicePairsUniqBackup"
HashKOrKV = "HashKOrKV"
HashV = "HashV"
ThrustTemp = "ThrustTemp"
SPCONV_DEBUG_WEIGHT = False SPCONV_DEBUG_WEIGHT = False
SPCONV_CPP_INDICE_PAIRS = True
SPCONV_CPP_INDICE_PAIRS_IGEMM = True
SPCONV_CPP_GEMM = True
\ No newline at end of file
...@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo ...@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
from cumm.gemm import kernel from cumm.gemm import kernel
from typing import List from typing import List
from cumm.gemm.algospec.core import TensorOp from cumm.gemm.algospec.core import TensorOp
from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvIterAlgo, GemmAlgo from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvFwd, ConvIterAlgo, GemmAlgo
from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout, from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
ConvLayoutType, ConvMode, ConvOpType) ConvLayoutType, ConvMode, ConvOpType)
from spconv.algocore import get_gemm_algo_desp_from_param
from spconv.constants import NDIM_DONT_CARE from spconv.constants import NDIM_DONT_CARE
...@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
increment_k_first=True, increment_k_first=True,
access_per_vector=1), access_per_vector=1),
] ]
IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Simt,
None,
mask_sparse=True,
increment_k_first=True,
access_per_vector=1),
*gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Simt,
None,
mask_sparse=True,
increment_k_first=True,
access_per_vector=1),
]
IMPLGEMM_VOLTA_PARAMS = [ IMPLGEMM_VOLTA_PARAMS = [
...@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [ ...@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
# NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1), # NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
# gen_conv_params(ConvFwdAndBwdInput, ) # gen_conv_params(ConvFwdAndBwdInput, )
# all int8 kernels use nvrtc.
*gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (32, 64, 64), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 64, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 32, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (128, 128, 64), (64, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
*gen_conv_params(ConvFwd, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["s8,s8,s8,s32,s32"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=False),
# *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
] ]
ALL_NATIVE_PARAMS = SHUFFLE_SIMT_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_VOLTA_PARAMS ALL_NATIVE_PARAMS = SHUFFLE_SIMT_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_VOLTA_PARAMS
......
...@@ -48,7 +48,7 @@ class SpconvOps: ...@@ -48,7 +48,7 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int: def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0, use_bound_algo: bool = False) -> int:
""" """
Args: Args:
indices: indices:
...@@ -58,6 +58,7 @@ class SpconvOps: ...@@ -58,6 +58,7 @@ class SpconvOps:
indice_pairs_uniq: indice_pairs_uniq:
indice_pairs_uniq_before_sort: indice_pairs_uniq_before_sort:
out_inds: out_inds:
indice_num_per_loc:
num_out_act: num_out_act:
batch_size: batch_size:
output_dims: output_dims:
...@@ -68,6 +69,7 @@ class SpconvOps: ...@@ -68,6 +69,7 @@ class SpconvOps:
dilation: dilation:
transposed: transposed:
stream_int: stream_int:
use_bound_algo:
""" """
... ...
@staticmethod @staticmethod
...@@ -191,6 +193,31 @@ class SpconvOps: ...@@ -191,6 +193,31 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None:
"""
Args:
out_features:
features:
indice_pairs:
indice_pair_num:
num_activate_out:
stream:
"""
...
@staticmethod
def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None:
"""
Args:
din:
features:
out_features:
out_bp:
indice_pairs:
indice_pair_num:
stream:
"""
...
@staticmethod
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None: def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
""" """
Args: Args:
...@@ -369,7 +396,18 @@ class SpconvOps: ...@@ -369,7 +396,18 @@ class SpconvOps:
@staticmethod @staticmethod
def get_int32_max() -> int: ... def get_int32_max() -> int: ...
@staticmethod @staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor: def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int:
"""
Args:
kv:
num_act_in:
num_act_out_bound:
subm:
use_int64_hash_k:
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
""" """
Args: Args:
allocator: allocator:
...@@ -386,10 +424,11 @@ class SpconvOps: ...@@ -386,10 +424,11 @@ class SpconvOps:
transposed: transposed:
is_train: is_train:
stream_int: stream_int:
num_out_act_bound:
""" """
... ...
@staticmethod @staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) -> None: def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int:
""" """
Args: Args:
allocator: allocator:
...@@ -405,12 +444,6 @@ class SpconvOps: ...@@ -405,12 +444,6 @@ class SpconvOps:
subm: subm:
transposed: transposed:
stream_int: stream_int:
""" num_out_act_bound:
...
@staticmethod
def test_allocator(allocator) -> None:
"""
Args:
allocator:
""" """
... ...
...@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty ...@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
class ExternalAllocator: class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
""" """
Args: Args:
name: name:
shape: shape:
dtype: dtype:
device: device:
is_temp_memory:
stream:
""" """
... ...
def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
""" """
Args: Args:
name: name:
shape: shape:
dtype: dtype:
device: device:
is_temp_memory:
stream:
""" """
... ...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor: def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
""" """
Args: Args:
name: name:
...@@ -28,9 +32,11 @@ class ExternalAllocator: ...@@ -28,9 +32,11 @@ class ExternalAllocator:
value: value:
dtype: dtype:
device: device:
is_temp_memory:
stream:
""" """
... ...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor: def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
""" """
Args: Args:
name: name:
...@@ -38,6 +44,14 @@ class ExternalAllocator: ...@@ -38,6 +44,14 @@ class ExternalAllocator:
value: value:
dtype: dtype:
device: device:
is_temp_memory:
stream:
"""
...
def get_tensor_by_name(self, name: str) -> Tensor:
"""
Args:
name:
""" """
... ...
def free(self, ten: Tensor) -> None: def free(self, ten: Tensor) -> None:
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from ...csrc.sparse.convops import ExternalSpconvMatmul
class GemmTuneResult:
    """Type stub for a GEMM tuning result record.

    Declares three attributes (``algo_desp``, ``arch``, ``splitk``), a
    validity query, and two constructor overloads: one without arguments and
    one taking a value for each of the three attributes' types.
    """
    # Descriptor of a GEMM algorithm.
    algo_desp: GemmAlgoDesp
    # Pair of ints; presumably a (major, minor) compute capability -- TODO confirm.
    arch: Tuple[int, int]
    # Integer split-k value.
    splitk: int
    # Returns a bool; the validity criterion is not visible in this stub.
    def is_valid(self) -> bool: ...
    @overload
    def __init__(self) -> None: ...
    @overload
    def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
        """
        Args:
            algo_desp: a GEMM algorithm descriptor.
            arch: pair of ints (same type as the ``arch`` attribute).
            splitk: integer split-k value.
        """
        ...
class ConvTuneResult:
    """Type stub for a convolution tuning result record.

    Declares three attributes (``algo_desp``, ``arch``, ``splitk``), two
    constructor overloads (no arguments, or one value per attribute type),
    and a validity query.
    """
    # Descriptor of a convolution algorithm.
    algo_desp: ConvAlgoDesp
    # Pair of ints; presumably a (major, minor) compute capability -- TODO confirm.
    arch: Tuple[int, int]
    # Integer split-k value.
    splitk: int
    @overload
    def __init__(self) -> None: ...
    @overload
    def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
        """
        Args:
            algo_desp: a convolution algorithm descriptor.
            arch: pair of ints (same type as the ``arch`` attribute).
            splitk: integer split-k value.
        """
        ...
    # Returns a bool; the validity criterion is not visible in this stub.
    def is_valid(self) -> bool: ...
class ExternalSpconvMatmul:
    """Type stub for the external matmul interface used by indice convolution.

    All tensor-like inputs are passed as strings (parameters ending in
    ``_n``); presumably these are tensor names resolved through an allocator
    -- TODO confirm against the C++ implementation.
    """
    def indice_conv_init_gemm(self, features_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, out_channel: int, stream_int: int = 0) -> Tensor:
        """Forward "init" GEMM hook; returns a ``Tensor``.

        Args:
            features_n: string identifying the features tensor.
            filters_n: string identifying the filters tensor.
            all_weight_is_krsc:
            is_kc_not_ck:
            kv_center:
            out_channel:
            stream_int: defaults to 0.
        """
        ...
    def indice_conv_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
        """Forward CPU GEMM hook; returns nothing.

        Args:
            inp_buffer_n: string identifying the input buffer.
            out_buffer_n: string identifying the output buffer.
            filters_n: string identifying the filters tensor.
            all_weight_is_krsc:
            is_kc_not_ck:
            nhot:
            index:
        """
        ...
    def indice_conv_bwd_init_gemm(self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, stream_int: int = 0) -> Tensor:
        """Backward "init" GEMM hook; returns a ``Tensor``.

        Args:
            features_n: string identifying the features tensor.
            filters_n: string identifying the filters tensor.
            out_bp_n: string identifying the output-gradient tensor (presumably).
            dfilters_n: string identifying the filter-gradient tensor (presumably).
            all_weight_is_krsc:
            is_kc_not_ck:
            kv_center:
            stream_int: defaults to 0.
        """
        ...
    def indice_conv_bwd_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
        """Backward CPU GEMM hook; returns nothing.

        Args:
            inp_buffer_n: string identifying the input buffer.
            out_buffer_n: string identifying the output buffer.
            filters_n: string identifying the filters tensor.
            dfilters_n: string identifying the filter-gradient tensor (presumably).
            all_weight_is_krsc:
            is_kc_not_ck:
            nhot:
            index:
        """
        ...
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
    """Type stub for an ``ExternalSpconvMatmul`` subclass built from an allocator."""
    def __init__(self, alloc) -> None:
        """
        Args:
            alloc: untyped in this stub; presumably an ``ExternalAllocator``
                instance -- TODO confirm.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
class ConvTunerSimple:
    """Type stub for the convolution tuner exposed by the compiled extension.

    The tuner is constructed from a list of ``ConvAlgoDesp`` and offers
    methods to list candidate algorithms, tune and cache a choice, look up a
    cached choice, run with a tuned result, and query workspace size. Only
    signatures are declared here; behaviour lives in the C++ implementation.
    """
    def __init__(self, desps: List[ConvAlgoDesp]) -> None:
        """
        Args:
            desps: list of convolution algorithm descriptors.
        """
        ...
    @staticmethod
    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
        """Return a list of strings for the given ``arch`` pair.

        Args:
            arch: pair of ints; presumably (major, minor) compute
                capability -- TODO confirm.
        """
        ...
    def get_all_available(self, inp: Tensor, weight: Tensor, out: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], op_type: int, mask_width: int, auto_fp32_accum: bool, fp32_accum: bool) -> List[ConvAlgoDesp]:
        """Return the list of ``ConvAlgoDesp`` matching the given arguments.

        Args:
            inp:
            weight:
            out:
            layout_i:
            layout_w:
            layout_o:
            interleave_i:
            interleave_w:
            interleave_o:
            arch:
            op_type:
            mask_width:
            auto_fp32_accum:
            fp32_accum:
        """
        ...
    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
        """Return ``NVRTCParams`` for a descriptor and ``arch`` pair.

        Args:
            desp:
            arch:
            stream_int:
        """
        ...
    def tune_and_cache(self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, mask_output: Tensor = Tensor(), alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, auto_fp32_accum: bool = True, fp32_accum: bool = False, num_run: int = 5) -> Tuple[ConvTuneResult, float]:
        """Return a ``(ConvTuneResult, float)`` pair.

        The meaning of the float is not visible in this stub (presumably a
        measured time -- TODO confirm).

        Args:
            op_type:
            inp:
            weight:
            output:
            layout_i:
            layout_w:
            layout_o:
            interleave_i:
            interleave_w:
            interleave_o:
            arch:
            mask:
            mask_argsort:
            indices:
            reverse_mask:
            mask_filter: defaults to 0xffffffff.
            mask_width: defaults to -1.
            mask_output: defaults to an empty-constructed ``Tensor``.
            alpha: defaults to 1.0.
            beta: defaults to 0.0.
            stream_int: defaults to 0.
            auto_fp32_accum: defaults to True.
            fp32_accum: defaults to False.
            num_run: defaults to 5.
        """
        ...
    def get_tuned_algo(self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int, k: int, c: int, arch: Tuple[int, int], mask_width: int = -1) -> Tuple[Any, bool]:
        """Return an ``(object, bool)`` pair for the given lookup key.

        Presumably the bool reports whether a cached result exists -- TODO
        confirm.

        Args:
            op_type:
            i_dtype:
            w_dtype:
            o_dtype:
            k:
            c:
            arch:
            mask_width: defaults to -1.
        """
        ...
    # NOTE: the generated stub used ``CUDAKernelTimer(false)`` (C++ spelling);
    # ``false`` is not a Python name, so the default is written ``False`` to
    # match the sibling GemmTunerSimple stub.
    def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
        """Run using a previously obtained tuning result; returns nothing.

        Args:
            profile_res: untyped in this stub; presumably a
                ``ConvTuneResult`` -- TODO confirm.
            op_type:
            inp:
            weight:
            output:
            mask:
            mask_argsort:
            mask_output:
            indices:
            reverse_mask:
            mask_filter: defaults to 0xffffffff.
            mask_width: defaults to -1.
            alpha: defaults to 1.0.
            beta: defaults to 0.0.
            stream_int: defaults to 0.
            workspace: defaults to an empty-constructed ``Tensor``.
            verbose: defaults to False.
            timer: defaults to ``CUDAKernelTimer(False)``.
            force_nvrtc: defaults to False.
        """
        ...
    def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
        """Return an integer workspace size for the given configuration.

        Args:
            desp:
            splitk:
            op_type:
            N:
            C:
            K:
            kv:
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
class GemmTunerSimple:
    """Type stub for the GEMM tuner exposed by the compiled extension.

    The tuner is constructed from a list of ``GemmAlgoDesp`` and offers
    methods to list candidate algorithms, tune and cache a choice, look up a
    cached choice, and run with a tuned result. Only signatures are declared
    here; behaviour lives in the C++ implementation.
    """
    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
        """
        Args:
            desps: list of GEMM algorithm descriptors.
        """
        ...
    @staticmethod
    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
        """Return a list of strings for the given ``arch`` pair.

        Args:
            arch: pair of ints; presumably (major, minor) compute
                capability -- TODO confirm.
        """
        ...
    def get_all_available(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int) -> List[GemmAlgoDesp]:
        """Return the list of ``GemmAlgoDesp`` matching the given arguments.

        Args:
            a:
            b:
            c:
            trans_a:
            trans_b:
            trans_c:
            arch:
            shuffle_type:
        """
        ...
    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
        """Return ``NVRTCParams`` for a descriptor and ``arch`` pair.

        Args:
            desp:
            arch:
            stream_int:
        """
        ...
    def tune_and_cache(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, num_run: int = 5) -> Tuple[GemmTuneResult, float]:
        """Return a ``(GemmTuneResult, float)`` pair.

        The meaning of the float is not visible in this stub (presumably a
        measured time -- TODO confirm).

        Args:
            a:
            b:
            c:
            trans_a:
            trans_b:
            trans_c:
            arch:
            shuffle_type:
            a_inds:
            b_inds:
            c_inds:
            hint: defaults to 0.
            alpha: defaults to 1.0.
            beta: defaults to 0.0.
            stream_int: defaults to 0.
            num_run: defaults to 5.
        """
        ...
    def get_tuned_algo(self, a_dtype: int, b_dtype: int, c_dtype: int, a_shape: List[int], b_shape: List[int], c_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds_shape: List[int], b_inds_shape: List[int], c_inds_shape: List[int], hint: int = 0) -> Tuple[Any, bool]:
        """Return an ``(object, bool)`` pair for the given lookup key.

        Presumably the bool reports whether a cached result exists -- TODO
        confirm.

        Args:
            a_dtype:
            b_dtype:
            c_dtype:
            a_shape:
            b_shape:
            c_shape:
            trans_a:
            trans_b:
            trans_c:
            arch:
            shuffle_type:
            a_inds_shape:
            b_inds_shape:
            c_inds_shape:
            hint: defaults to 0.
        """
        ...
    def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
        """Run using a previously obtained tuning result; returns nothing.

        Args:
            profile_res: untyped in this stub; presumably a
                ``GemmTuneResult`` -- TODO confirm.
            a:
            b:
            c:
            trans_a:
            trans_b:
            trans_c:
            arch:
            stream_int:
            shuffle_type:
            a_inds:
            b_inds:
            c_inds:
            hint: defaults to 0.
            alpha: defaults to 1.0.
            beta: defaults to 0.0.
            workspace: defaults to an empty-constructed ``Tensor``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            force_nvrtc: defaults to False.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
    """Type stub for the compiled ``ConvGemmOps`` binding.

    Every member is a static method with a ``...`` body. Summaries are
    inferred from names unless they quote the original stub text; confirm
    details against the C++ implementation.
    """

    @staticmethod
    def get_compute_capability(index: int = -1) -> Tuple[int, int]:
        """Return a pair of integers for the device selected by ``index``.

        Args:
            index: integer, defaults to ``-1``.
        """
        ...

    @staticmethod
    def indice_conv(
            allocator,
            ext_mm,
            gemm_tuner,
            all_w_is_krsc: bool,
            filter_hwio: bool,
            features: Tensor,
            filters: Tensor,
            indice_pairs: Tensor,
            indice_pair_num: Tensor,
            num_activate_out: int,
            inverse: bool = False,
            subm: bool = False,
            algo: int = 0,
            stream_int: int = 0) -> None:
        """Indice-pair based sparse convolution.

        Notes carried over from the original stub (reworded):

        1. This function needs to take an output-features tensor that
           comes from the first subm mm.
        2. This function does not support CPU.

        Args:
            allocator: untyped in this stub.
            ext_mm: untyped in this stub.
            gemm_tuner: untyped in this stub.
            all_w_is_krsc: boolean flag.
            filter_hwio: boolean flag.
            features: tensor argument.
            filters: tensor argument.
            indice_pairs: tensor argument.
            indice_pair_num: tensor argument.
            num_activate_out: integer argument.
            inverse: boolean flag, defaults to ``False``.
            subm: boolean flag, defaults to ``False``.
            algo: integer, defaults to ``0``.
            stream_int: integer, defaults to ``0``.
        """
        ...

    @staticmethod
    def indice_conv_backward(
            allocator,
            ext_mm,
            gemm_tuner,
            all_w_is_krsc: bool,
            filter_hwio: bool,
            features: Tensor,
            filters: Tensor,
            out_bp: Tensor,
            indice_pairs: Tensor,
            indice_pair_num: Tensor,
            inverse: bool = False,
            subm: bool = False,
            algo: int = 0,
            stream_int: int = 0) -> None:
        """Backward counterpart of ``indice_conv`` (per the name).

        Args:
            allocator: untyped in this stub.
            ext_mm: untyped in this stub.
            gemm_tuner: untyped in this stub.
            all_w_is_krsc: boolean flag.
            filter_hwio: boolean flag.
            features: tensor argument.
            filters: tensor argument.
            out_bp: tensor argument.
            indice_pairs: tensor argument.
            indice_pair_num: tensor argument.
            inverse: boolean flag, defaults to ``False``.
            subm: boolean flag, defaults to ``False``.
            algo: integer, defaults to ``0``.
            stream_int: integer, defaults to ``0``.
        """
        ...

    @staticmethod
    def implicit_gemm(
            allocator,
            conv_tuner,
            features: Tensor,
            filters: Tensor,
            pair_fwd: Tensor,
            pair_mask_fwd_splits: List[Tensor],
            mask_argsort_fwd_splits: List[Tensor],
            num_activate_out: int,
            masks: Tensor,
            is_train: bool = False,
            is_subm: bool = False,
            stream_int: int = 0,
            timer: CUDAKernelTimer = CUDAKernelTimer(False),
            auto_fp32_accum: bool = True,
            fp32_accum: bool = False) -> int:
        """Implicit-GEMM sparse convolution; returns an integer.

        The meaning of the returned integer is not visible from this
        stub -- TODO confirm.

        Args:
            allocator: untyped in this stub.
            conv_tuner: untyped in this stub.
            features: tensor argument.
            filters: tensor argument.
            pair_fwd: tensor argument.
            pair_mask_fwd_splits: list of tensors.
            mask_argsort_fwd_splits: list of tensors.
            num_activate_out: integer argument.
            masks: tensor argument.
            is_train: boolean flag, defaults to ``False``.
            is_subm: boolean flag, defaults to ``False``.
            stream_int: integer, defaults to ``0``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            auto_fp32_accum: boolean flag, defaults to ``True``.
            fp32_accum: boolean flag, defaults to ``False``.
        """
        ...

    @staticmethod
    def implicit_gemm_backward(
            allocator,
            conv_tuner,
            features: Tensor,
            filters: Tensor,
            out_bp: Tensor,
            pair_fwd: Tensor,
            pair_bwd: Tensor,
            pair_mask_fwd_splits: List[Tensor],
            pair_mask_bwd_splits: List[Tensor],
            mask_argsort_fwd_splits: List[Tensor],
            mask_argsort_bwd_splits: List[Tensor],
            mask_output_fwd: Tensor,
            masks: Tensor,
            mask_width: int,
            is_subm: bool,
            stream_int: int = 0,
            timer: CUDAKernelTimer = CUDAKernelTimer(False),
            auto_fp32_accum: bool = True,
            fp32_accum: bool = False) -> None:
        """Backward counterpart of ``implicit_gemm`` (per the name).

        Args:
            allocator: untyped in this stub.
            conv_tuner: untyped in this stub.
            features: tensor argument.
            filters: tensor argument.
            out_bp: tensor argument.
            pair_fwd: tensor argument.
            pair_bwd: tensor argument.
            pair_mask_fwd_splits: list of tensors.
            pair_mask_bwd_splits: list of tensors.
            mask_argsort_fwd_splits: list of tensors.
            mask_argsort_bwd_splits: list of tensors.
            mask_output_fwd: tensor argument.
            masks: tensor argument.
            mask_width: integer (required here, no default).
            is_subm: boolean flag (required here, no default).
            stream_int: integer, defaults to ``0``.
            timer: defaults to ``CUDAKernelTimer(False)``.
            auto_fp32_accum: boolean flag, defaults to ``True``.
            fp32_accum: boolean flag, defaults to ``False``.
        """
        ...
...@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue ...@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo: class CompileInfo:
@staticmethod @staticmethod
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ... def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
@staticmethod
def arch_is_compiled(arch: Tuple[int, int]) -> bool:
"""
Args:
arch:
"""
...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import GemmParams from cumm.tensorview.gemm import GemmParams
class GemmMainUnitTest: class GemmMainUnitTest:
@staticmethod @staticmethod
def get_all_algo_desp() -> List[Any]: ... def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
@staticmethod @staticmethod
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "0", a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]: def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: int = 0, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
""" """
Args: Args:
a_shape: a_shape:
......
This diff is collapsed.
import pccm import pccm
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
from spconv.constants import AllocKeys
class ExternalAllocatorGuard(pccm.Class): class ExternalAllocatorGuard(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class): ...@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True) @pccm.pybind.mark(virtual=True)
...@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class): ...@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True) @pccm.pybind.mark(virtual=True)
...@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class): ...@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int") code.arg("value", "int")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True) @pccm.pybind.mark(virtual=True)
...@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class): ...@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "float") code.arg("value", "float")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
@pccm.member_function(virtual=True, pure_virtual=True)
def get_tensor_by_name(self):
code = pccm.code()
code.arg("name", "std::string")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True) @pccm.pybind.mark(virtual=True)
...@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class): ...@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
// "" means temp memory // "" means temp memory
auto ten = zeros("", shape, dtype, device); auto ten = zeros(name, shape, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class): ...@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = empty("", shape, dtype, device); auto ten = empty(name, shape, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class): ...@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int") code.arg("value", "int")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = full_int("", shape, value, dtype, device); auto ten = full_int(name, shape, value, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -150,14 +176,16 @@ class ExternalAllocator(pccm.Class): ...@@ -150,14 +176,16 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int") code.arg("value", "int")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = full_float("", shape, value, dtype, device); auto ten = full_float(name, shape, value, dtype, device, true, stream);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t); this->free(t);
}}); }});
""") """)
return code.ret(f"std::{self.ptr_type}_ptr<ExternalAllocatorGuard>") return code.ret(f"std::{self.ptr_type}_ptr<ExternalAllocatorGuard>")
class ThrustAllocator(pccm.Class): class ThrustAllocator(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class): ...@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
code.arg("num_bytes", "std::ptrdiff_t") code.arg("num_bytes", "std::ptrdiff_t")
code.ret("char*") code.ret("char*")
code.raw(f""" code.raw(f"""
auto ten = allocator_.empty("", {{num_bytes}}, tv::uint8, 0); auto ten = allocator_.empty({pccm.literal(AllocKeys.ThrustTemp)}, {{num_bytes}}, tv::uint8, 0);
return reinterpret_cast<char*>(ten.raw_data()); return reinterpret_cast<char*>(ten.raw_data());
""") """)
return code return code
...@@ -192,4 +220,159 @@ class ThrustAllocator(pccm.Class): ...@@ -192,4 +220,159 @@ class ThrustAllocator(pccm.Class):
code.raw(f""" code.raw(f"""
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0)); return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
""") """)
return code return code
class StaticAllocator(ExternalAllocator):
    """a simple allocator for tensorrt plugin.
    """
    # Serves every named request out of a fixed name -> tensor map passed
    # to the constructor; nothing is allocated except the thrust scratch
    # buffer, which is created and grown on demand inside ``empty``.
    #
    # Methods are documented with ``#`` comments rather than docstrings so
    # the generated output stays unchanged; pccm appears to forward Python
    # docstrings into generated stubs -- confirm before converting them.

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        # name -> tensor map supplied through the constructor.
        self.add_member("tensor_dict_",
                        "std::unordered_map<std::string, tv::Tensor>")
        # listing of tensor_dict_ built once in the constructor; printed by
        # get_tensor_by_name when a name is missing.
        self.add_member("repr_", "std::string")
        # scratch buffer returned for AllocKeys.ThrustTemp requests.
        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
        # growth factor used when the thrust scratch buffer is too small.
        self.grow = 1.5

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        # Stores tensor_dict and pre-renders one "name shape dtype" line
        # per entry into repr_.
        code = pccm.code()
        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
        code.ctor_init("tensor_dict_", "tensor_dict")
        code.raw(f"""
        std::stringstream ss;
        for (auto& p : tensor_dict){{
            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
        }}
        repr_ = ss.str();
        """)
        return code

    @pccm.member_function(virtual=True)
    def _get_raw_and_check(self):
        # Looks up the tensor registered under `name`, asserts it is large
        # enough and on the requested device, then returns a view of its
        # storage with the requested shape and dtype.
        #
        # The std::accumulate seed is int64_t(1): the accumulator takes the
        # type of the seed, so a plain `1` would multiply in `int`.
        #
        # NOTE(review): the capacity check compares nbytes() against
        # total * tv::bit_size(...). If bit_size returns bits, not bytes,
        # this demands 8x the needed storage -- confirm against cumm.
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.raw(f"""
        auto res = get_tensor_by_name(name);
        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype))
            && res.device() == device, "alloc failed", shape, res.shape());
        return tv::from_blob(res.raw_data(), shape, dtype, device);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def zeros(self):
        # Returns the named buffer viewed as `shape`/`dtype`, zero-filled
        # on the caller's stream.
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.zero_(tvctx);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def empty(self):
        # Returns the named buffer without initialising it. Requests for
        # AllocKeys.ThrustTemp bypass tensor_dict_ and are served from one
        # member buffer that is reallocated (scaled by self.grow) when the
        # requested first dimension exceeds the current one; the returned
        # tensor may therefore be larger than `shape`.
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
            // we assume each allocator always handle one stream
            // so we can just use one tensor
            tv::Tensor res = thrust_tmp_tensor_;
            if (res.empty()){{
                res = tv::empty(shape, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            return res;
        }}else{{
            auto blob = _get_raw_and_check(name, shape, dtype, device);
            return blob;
        }}
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_int(self):
        # Returns the named buffer filled with an integer value. The
        # context is bound to the caller's stream, matching `zeros`;
        # the original left the context on its default stream.
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_float(self):
        # Returns the named buffer filled with a float value. `tvctx` is
        # declared here and bound to the caller's stream; the original
        # referenced it without declaring it, which cannot compile.
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def get_tensor_by_name(self):
        # Returns the tensor registered under `name`; raises a runtime
        # error listing the registered tensors when the name is unknown.
        code = pccm.code()
        code.arg("name", "std::string")
        code.raw(f"""
        TV_ASSERT_RT_ERR(tensor_dict_.find(name) != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
        return tensor_dict_.at(name);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free(self):
        # Intentionally empty body: no code is emitted for free.
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free_noexcept(self):
        # Intentionally empty body, same as `free`.
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment