Commit 62c1496f authored by yan.yan's avatar yan.yan
Browse files

bug fix

parent 01ed382c
# Builds binary wheels on Windows (native) and Linux (manylinux2014 docker
# image) for every CUDA version in the matrices, and uploads them to PyPI
# when the push is a tag.
name: build

on:
  push:
    branches:
      - main
      - feature/*
    tags:
      - '*'

jobs:
  build-windows:
    runs-on: windows-latest
    strategy:
      matrix:
        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
        cuda-version: ['10.2', '11.1', '11.4']
    steps:
      # Pinned to a release tag instead of a moving branch.
      - uses: actions/checkout@v2
      # Runs on tag pushes for the whole matrix; on ordinary pushes only the
      # CUDA 11.1 / Python 3.10 combination runs.
      - name: Install CUDA
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          # NOTE(review): presumably read by install_windows_cuda.ps1 - confirm.
          cuda: ${{ matrix.cuda-version }}
        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
        shell: powershell
        run: .\tools\install_windows_cuda.ps1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - uses: ilammy/msvc-dev-cmd@v1
      - name: Install pep build
        run: |
          python -m pip install build --user
          python -m pip install --upgrade pip twine wheel
          python -m pip install pytest setuptools
      # Same gating as "Install CUDA": full matrix on tags, one combination
      # otherwise.
      - name: Build a windows binary wheel
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
        run: |
          $Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
          $Env:CUMM_CUDA_ARCH_LIST = "all"
          $Env:SPCONV_DISABLE_JIT = "1"
          pip install pccm pybind11
          python -m build --wheel --outdir dist/ .
        shell: powershell
      - name: Publish a Python distribution to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        # Credentials are handed to twine through the step environment rather
        # than being interpolated into the PowerShell script text, where a
        # token containing `$`, a backtick or `"` would be altered by
        # double-quoted string expansion.
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.pypi_password }}
        run: |
          twine upload dist/*
        shell: powershell

  build:
    needs: build-windows
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        python-version: ['3.8']  # this version is only used for upload.
        cuda-version: ['102', '111', '114']
    steps:
      # Pinned to a release tag instead of a moving branch.
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install pep build
        run: |
          python -m pip install build --user
          python -m pip install --upgrade pip twine wheel
          python -m pip install pytest setuptools
      # Runs for every CUDA version on tag pushes; on ordinary pushes only the
      # CUDA '114' entry is built.
      - name: Build a cuda wheel
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel
          PLAT: manylinux2014_x86_64
        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || env.CUDA_VERSION == '114'
        run: |
          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"
      - name: Publish a Python distribution to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        # The action's `master` branch has been sunset; `release/v1` is the
        # supported ref.
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.pypi_password }}
\ No newline at end of file
...@@ -34,7 +34,6 @@ ...@@ -34,7 +34,6 @@
* training/inference speed is increased * training/inference speed is increased
* support int8/tensor core * support int8/tensor core
* doesn't depend on pytorch binary. * doesn't depend on pytorch binary.
* If your GPU has tensor core, try mixed precision training in spconv 2.x!
* since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference. * since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.
## TODO in Spconv 2.x ## TODO in Spconv 2.x
......
[build-system] [build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.3"] requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.5"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -30,7 +30,7 @@ DESCRIPTION = 'spatial sparse convolution' ...@@ -30,7 +30,7 @@ DESCRIPTION = 'spatial sparse convolution'
URL = 'https://github.com/traveller59/spconv' URL = 'https://github.com/traveller59/spconv'
EMAIL = 'yanyan.sub@outlook.com' EMAIL = 'yanyan.sub@outlook.com'
AUTHOR = 'Yan Yan' AUTHOR = 'Yan Yan'
REQUIRES_PYTHON = '>=3.7' REQUIRES_PYTHON = '>=3.6'
VERSION = None VERSION = None
# What packages are required for this module to be executed? # What packages are required for this module to be executed?
...@@ -134,12 +134,11 @@ if disable_jit is not None and disable_jit == "1": ...@@ -134,12 +134,11 @@ if disable_jit is not None and disable_jit == "1":
} }
from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
from cumm.gemm.gather import GatherAll
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS) cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
cu.namespace = "cumm.gemm.main" cu.namespace = "cumm.gemm.main"
ext_modules: List[Extension] = [ ext_modules: List[Extension] = [
PCCMExtension([cu, SpconvOps(), GatherAll()], PCCMExtension([cu, SpconvOps()],
"spconv/core_cc", "spconv/core_cc",
Path(__file__).resolve().parent / "spconv") Path(__file__).resolve().parent / "spconv")
] ]
......
...@@ -22,7 +22,6 @@ from .constants import PACKAGE_NAME, PACKAGE_ROOT ...@@ -22,7 +22,6 @@ from .constants import PACKAGE_NAME, PACKAGE_ROOT
if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME): if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
# from cumm.gemm.gather import GatherAll, ScatterAll
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS) cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
cu.namespace = "cumm.gemm.main" cu.namespace = "cumm.gemm.main"
pccm.builder.build_pybind([cu, SpconvOps()], pccm.builder.build_pybind([cu, SpconvOps()],
......
...@@ -3,14 +3,12 @@ from pccm.stubs import EnumValue, EnumClassValue ...@@ -3,14 +3,12 @@ from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
class SpconvOps: class SpconvOps:
@staticmethod @staticmethod
def generate_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]) -> int: def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None:
""" """
Args: Args:
indices: indices:
hashdata:
indice_pairs: indice_pairs:
indice_pairs_uniq: indice_pairs_uniq:
out_inds:
indice_num_per_loc: indice_num_per_loc:
batch_size: batch_size:
output_dims: output_dims:
...@@ -19,28 +17,22 @@ class SpconvOps: ...@@ -19,28 +17,22 @@ class SpconvOps:
stride: stride:
padding: padding:
dilation: dilation:
transposed:
stream_int:
""" """
... ...
@staticmethod @staticmethod
def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: def generate_conv_inds_stage1_5(indice_pairs_uniq: Tensor, ndim: int, uniq_size: int, stream_int: int = 0) -> int:
""" """
Args: Args:
indices:
indice_pairs:
indice_pairs_uniq: indice_pairs_uniq:
indice_num_per_loc: ndim:
batch_size: uniq_size:
output_dims:
input_dims:
ksize:
stride:
padding:
dilation:
stream_int: stream_int:
""" """
... ...
@staticmethod @staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int:
""" """
Args: Args:
indices: indices:
...@@ -56,6 +48,7 @@ class SpconvOps: ...@@ -56,6 +48,7 @@ class SpconvOps:
stride: stride:
padding: padding:
dilation: dilation:
transposed:
stream_int: stream_int:
""" """
... ...
......
...@@ -35,26 +35,30 @@ class SpconvOps(pccm.Class): ...@@ -35,26 +35,30 @@ class SpconvOps(pccm.Class):
problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC) problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
indices = SparseConvIndicesKernel(problem, dtypes.int32) indices = SparseConvIndicesKernel(problem, dtypes.int32)
# self.add_param_class("ops", indices, "SpconvIndices") # self.add_param_class("ops", indices, "SpconvIndices")
cuda_funcs = [self.generate_conv_inds, self.generate_subm_conv_inds, cuda_funcs = [self.generate_subm_conv_inds,
self.generate_conv_inds_stage1, self.generate_conv_inds_stage2, self.sort_1d_by_key] self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key]
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D") self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds(self): def generate_conv_inds_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor") code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor") code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>") code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>") code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f""" code.raw(f"""
int ndim = indices.dim(1) - 1; int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim && TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim && ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim); padding.size() == ndim, "your params size not equal to ndim", ndim);
""") """)
for ndim in self.ndims: for ndim in self.ndims:
code.raw(f""" code.raw(f"""
if (ndim == {ndim}){{ if (ndim == {ndim}){{
...@@ -68,53 +72,31 @@ class SpconvOps(pccm.Class): ...@@ -68,53 +72,31 @@ class SpconvOps(pccm.Class):
padding_[i] = padding[i]; padding_[i] = padding[i];
dilation_[i] = dilation[i]; dilation_[i] = dilation[i];
}} }}
return SpconvIndices{ndim}D::generate_conv_inds(indices, hashdata, return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc, indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_, batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_); ksize_, stride_, padding_, dilation_, transposed, stream_int);
}} }}
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("int")
return code# .ret("int")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage1(self): def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor") code.arg("indice_pairs_uniq", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor") code.arg("ndim", "int")
code.arg("batch_size", "int") code.arg("uniq_size", "int64_t")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
for ndim in self.ndims: for ndim in self.ndims:
code.raw(f""" code.raw(f"""
if (ndim == {ndim}){{ if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_; return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
}} }}
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("int") return code.ret("int")
@pccm.pybind.mark @pccm.pybind.mark
...@@ -127,6 +109,7 @@ class SpconvOps(pccm.Class): ...@@ -127,6 +109,7 @@ class SpconvOps(pccm.Class):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>") code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>") code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f""" code.raw(f"""
int ndim = indices.dim(1) - 1; int ndim = indices.dim(1) - 1;
...@@ -151,7 +134,7 @@ class SpconvOps(pccm.Class): ...@@ -151,7 +134,7 @@ class SpconvOps(pccm.Class):
return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata, return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, num_out_act, indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
batch_size, output_dims_, input_dims_, batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_); ksize_, stride_, padding_, dilation_, transposed, stream_int);
}} }}
""") """)
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""") code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
......
...@@ -85,7 +85,7 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -85,7 +85,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})") code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})") code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
return code return code
@pccm.cuda.member_function(host=True, @pccm.cuda.member_function(host=True,
...@@ -225,6 +225,28 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -225,6 +225,28 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""") """)
return code return code
@pccm.cuda.member_function(host=True, device=True, forceinline=True, const=True)
def query_nhw_out(self):
    # Emits a const, force-inlined host/device member function returning bool.
    # The generated body stores npq_to_nhw(npq_offset) into `nhw_offset` and
    # reports whether nhw_offset[0] is below problem_.N and every following
    # coordinate lies in [0, problem_.output_dims[axis]).
    fn = pccm.FunctionCode()
    fn.arg("npq_offset", "const int*")
    fn.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
    fn.ret("bool")
    fn.raw("""
nhw_offset = npq_to_nhw(npq_offset);
""")
    # One range check per spatial axis; slot 0 of nhw_offset is checked
    # separately against problem_.N in the return statement below.
    in_bounds = " && ".join(
        f"nhw_offset[{axis + 1}] >= 0 && "
        f"nhw_offset[{axis + 1}] < problem_.output_dims[{axis}]"
        for axis in range(self.ndim))
    fn.raw(f"""
return nhw_offset[0] < problem_.N &&
{in_bounds};
""")
    return fn
class SparseConvIndicesKernel(pccm.ParameterizedClass): class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType): def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__() super().__init__()
...@@ -255,7 +277,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -255,7 +277,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("indices_pair_size", "int") code.arg("indices_pair_size", "int")
code.arg("RS", "int") code.arg("RS", "int")
# code.arg("bool", "transposed") code.arg("transposed", "bool")
code.raw(f""" code.raw(f"""
int filter_offset = blockIdx.y; int filter_offset = blockIdx.y;
...@@ -264,7 +286,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -264,7 +286,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size; int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{ for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {self.ndim + 1}> npq_offset; tv::array<int, {self.ndim + 1}> npq_offset;
if (loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset)){{ bool valid;
if (transposed){{
valid = loc_iter.query_nhw_out(indices_in + i * {self.ndim + 1}, npq_offset);
}}else{{
valid = loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset);
}}
if (valid){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset); int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
{self.dtype_indices} offset = loc_iter.layout_npq(npq_offset); {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
if (old_num < indices_pair_size){{ if (old_num < indices_pair_size){{
...@@ -514,81 +542,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -514,81 +542,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code return code
@pccm.cuda.static_function
def generate_conv_inds(self):
    # Emits a static CUDA function returning int that produces convolution
    # index pairs in one call. The generated body:
    #   1. checks the tensor shapes against kv = prod(ksize),
    #   2. clears indice_pairs_uniq and launches calc_conv_indices_stage1,
    #   3. sorts and uniques indice_pairs_uniq with thrust on stream 0,
    #   4. builds a linear hash table over the unique output locations,
    #   5. launches calc_conv_indices_stage2 and returns num_out_act.
    # Timing information is printed through tv::ssprint after each phase.
    fn = pccm.FunctionCode()
    idx_t = self.dtype_indices  # index dtype used for the generated pointers
    coord_len = self.ndim + 1   # second dimension asserted for out_inds
    dims_t = f"tv::array<int, {self.ndim}>"

    fn.arg("indices, hashdata", "tv::Tensor")
    fn.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
    fn.arg("batch_size", "int")
    fn.arg("output_dims, input_dims", dims_t)
    fn.arg("ksize, stride, padding, dilation", dims_t)
    fn.raw(f"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize, {coord_len}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) == {coord_len}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size);
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{idx_t}>(), uniq_size);
tv::ssprint("clean time", timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{idx_t}>(),
indice_pairs_uniq.data_ptr<{idx_t}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
thrust::device_ptr<{idx_t}> ptr_tr(indice_pairs_uniq.data_ptr<{idx_t}>());
auto thrust_ctx = thrust::cuda::par.on(0);
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
// return num_out_act;
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act);
using V = {idx_t};
using KeyType = {idx_t};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear();
tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {idx_t}>(),
loc_iter.layout_npq, num_out_act);
tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
tv::ssprint("gem conv inds time", timer.report() / 1000.0);
return num_out_act;
""")
    return fn.ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage1(self): def generate_conv_inds_stage1(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -597,6 +550,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -597,6 +550,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
...@@ -620,7 +575,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -620,7 +575,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(), indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv); indice_pairs.dim(2), kv, transposed);
// thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
// auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
// thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
""")
return code# .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode()
code.arg("indice_pairs_uniq", "tv::Tensor")
code.arg("uniq_size", "int64_t")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>()); thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int)); auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size); thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
...@@ -630,6 +601,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -630,6 +601,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""") """)
return code.ret("int") return code.ret("int")
@pccm.cuda.static_function @pccm.cuda.static_function
def generate_conv_inds_stage2(self): def generate_conv_inds_stage2(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -639,6 +611,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -639,6 +611,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int") code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>") code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>") code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0") code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int); auto custream = reinterpret_cast<cudaStream_t>(stream_int);
...@@ -651,7 +624,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass): ...@@ -651,7 +624,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// out_inds: [MaxSize, {self.ndim + 1}] // out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>(); auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1; int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error"); TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error"); TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream); tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv; launcher_num_act_in.blocks.y = kv;
......
...@@ -22,7 +22,9 @@ from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, Thru ...@@ -22,7 +22,9 @@ from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, Thru
from cumm.gemm import codeops from cumm.gemm import codeops
from typing import List from typing import List
from cumm.conv.params import ConvProblem from cumm.conv.params import ConvProblem
from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
import numpy as np import numpy as np
from cumm.gemm import (thread_map)
class IndiceMaxPool(pccm.Class): class IndiceMaxPool(pccm.Class):
# TODO optimize this function # TODO optimize this function
...@@ -171,4 +173,3 @@ class IndiceMaxPool(pccm.Class): ...@@ -171,4 +173,3 @@ class IndiceMaxPool(pccm.Class):
}}); }});
""") """)
return code return code
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import math import math
import time import time
from typing import List, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -40,8 +41,12 @@ def _calculate_fan_in_and_fan_out_hwio(tensor): ...@@ -40,8 +41,12 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
fan_in = tensor.size(-2) fan_in = tensor.size(-2)
fan_out = tensor.size(-1) fan_out = tensor.size(-1)
else: else:
num_input_fmaps = tensor.size(-1) if FILTER_HWIO:
num_output_fmaps = tensor.size(-2) num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-1)
num_output_fmaps = tensor.size(-2)
receptive_field_size = 1 receptive_field_size = 1
if tensor.dim() > 2: if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel() receptive_field_size = tensor[..., 0, 0].numel()
...@@ -58,22 +63,22 @@ class SparseConvolution(SparseModule): ...@@ -58,22 +63,22 @@ class SparseConvolution(SparseModule):
] ]
def __init__(self, def __init__(self,
ndim, ndim: int,
in_channels, in_channels: int,
out_channels, out_channels: int,
kernel_size=3, kernel_size: Union[int, List[int], Tuple[int, ...]]=3,
stride=1, stride: Union[int, List[int], Tuple[int, ...]]=1,
padding=0, padding: Union[int, List[int], Tuple[int, ...]]=0,
dilation=1, dilation: Union[int, List[int], Tuple[int, ...]]=1,
groups=1, groups: Union[int, List[int], Tuple[int, ...]]=1,
bias=True, bias: bool=True,
subm=False, subm: bool=False,
output_padding=0, output_padding: Union[int, List[int], Tuple[int, ...]]=0,
transposed=False, transposed: bool=False,
inverse=False, inverse: bool=False,
indice_key=None, indice_key: Optional[str]=None,
fused_bn=False, fused_bn: bool=False,
algo=ops.ConvAlgo.Native, algo: ops.ConvAlgo=ops.ConvAlgo.Native,
name=None): name=None):
super(SparseConvolution, self).__init__(name=name) super(SparseConvolution, self).__init__(name=name)
assert groups == 1 assert groups == 1
...@@ -117,8 +122,6 @@ class SparseConvolution(SparseModule): ...@@ -117,8 +122,6 @@ class SparseConvolution(SparseModule):
self.bias = Parameter(torch.Tensor(out_channels)) self.bias = Parameter(torch.Tensor(out_channels))
else: else:
self.register_parameter('bias', None) self.register_parameter('bias', None)
# self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
# self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
...@@ -234,6 +237,7 @@ class SparseConvolution(SparseModule): ...@@ -234,6 +237,7 @@ class SparseConvolution(SparseModule):
t = time.time() t = time.time()
if self.fused_bn: if self.fused_bn:
raise NotImplementedError
assert self.bias is not None assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight, out_features = ops.fused_indice_conv(features, self.weight,
self.bias, self.bias,
...@@ -382,6 +386,34 @@ class SparseConv4d(SparseConvolution): ...@@ -382,6 +386,34 @@ class SparseConv4d(SparseConvolution):
name=name) name=name)
class SparseConvTranspose1d(SparseConvolution):
    """SparseConvolution preset with ndim=1 and transposed=True.

    All constructor arguments are forwarded unchanged to
    SparseConvolution.__init__; arguments not listed here keep that
    constructor's defaults.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        # Keyword form of the positional call: the names follow the leading
        # parameters of SparseConvolution.__init__.
        super().__init__(ndim=1,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups,
                         bias=bias,
                         transposed=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseConvTranspose2d(SparseConvolution): class SparseConvTranspose2d(SparseConvolution):
def __init__(self, def __init__(self,
in_channels, in_channels,
...@@ -437,6 +469,34 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -437,6 +469,34 @@ class SparseConvTranspose3d(SparseConvolution):
algo=algo, algo=algo,
name=name) name=name)
class SparseConvTranspose4d(SparseConvolution):
    """Transposed sparse convolution with the dimension argument fixed to 4.

    Thin convenience wrapper: every constructor argument is forwarded to
    ``SparseConvolution`` unchanged, with ``transposed=True`` added.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        # The leading 4 is the dimension count; the remaining positional
        # values are passed in the same order the caller supplied them.
        positional = (4, in_channels, out_channels, kernel_size, stride,
                      padding, dilation, groups, bias)
        super().__init__(*positional,
                         transposed=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseInverseConv1d(SparseConvolution): class SparseInverseConv1d(SparseConvolution):
def __init__(self, def __init__(self,
in_channels, in_channels,
......
...@@ -143,50 +143,3 @@ class SparseSequential(SparseModule): ...@@ -143,50 +143,3 @@ class SparseSequential(SparseModule):
input = module(input) input = module(input)
return input return input
def fused(self):
    """Build a new ``SparseSequential`` with conv + BatchNorm1d pairs folded.

    don't use this. no effect.

    Walks the child modules in order. Whenever a sparse convolution is
    immediately followed by an ``nn.BatchNorm1d``, the pair is replaced by a
    single ``SparseConvolution`` created with ``fused_bn=True`` whose weight
    and bias absorb the batch-norm affine transform computed from the
    running statistics. All other modules are carried over untouched.

    Returns:
        SparseSequential: a new container holding the (possibly fused)
        modules; ``self`` is not modified.
    """
    from spconv.pytorch.conv import SparseConvolution
    mods = list(self._modules.values())
    fused_mods = []
    idx = 0
    while idx < len(mods):
        cur = mods[idx]
        has_bn = (idx < len(mods) - 1
                  and isinstance(mods[idx + 1], nn.BatchNorm1d))
        if is_sparse_conv(cur) and has_bn:
            conv = SparseConvolution(
                ndim=cur.ndim,
                in_channels=cur.in_channels,
                out_channels=cur.out_channels,
                kernel_size=cur.kernel_size,
                stride=cur.stride,
                padding=cur.padding,
                dilation=cur.dilation,
                groups=cur.groups,
                bias=True,
                subm=cur.subm,
                output_padding=cur.output_padding,
                transposed=cur.transposed,
                inverse=cur.inverse,
                indice_key=cur.indice_key,
                fused_bn=True,
            )
            # Non-strict load: the source conv may have no bias entry.
            conv.load_state_dict(cur.state_dict(), False)
            conv.to(cur.weight.device)
            bn = mods[idx + 1]
            if cur.bias is None:
                # No bias was loaded, so start from zero. A bias that the
                # source conv did have must be kept, not wiped.
                conv.bias.data.zero_()
            # BatchNorm normalises by sqrt(var + eps): eps belongs inside
            # the square root, not added to the standard deviation.
            scale = bn.weight.data / torch.sqrt(bn.running_var + bn.eps)
            # NOTE(review): this broadcasts `scale` along the LAST weight
            # axis, which presumably must be the output-channel axis --
            # confirm against the filter layout in use.
            conv.weight.data[:] = conv.weight.data * scale
            conv.bias.data[:] = (
                conv.bias.data - bn.running_mean) * scale + bn.bias.data
            fused_mods.append(conv)
            idx += 2
        else:
            fused_mods.append(cur)
            idx += 1
    return SparseSequential(*fused_mods)
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import functools
from enum import Enum from enum import Enum
from cumm import tensorview as tv from cumm import tensorview as tv
from cumm.gemm.algospec.core import ShuffleStrideType from cumm.gemm.algospec.core import ShuffleStrideType
...@@ -23,7 +23,7 @@ from spconv.algo import AlgoHint, ConvAlgo ...@@ -23,7 +23,7 @@ from spconv.algo import AlgoHint, ConvAlgo
from typing import List, Union from typing import List, Union
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps from spconv.core_cc.csrc.sparse.all import SpconvOps
from spconv.algo import GEMM# , GATHER, SCATTER from spconv.algo import GEMM # , GATHER, SCATTER
import time import time
from spconv.constants import FILTER_HWIO from spconv.constants import FILTER_HWIO
...@@ -58,26 +58,17 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -58,26 +58,17 @@ def get_indice_pairs(indices: torch.Tensor,
batch_size: int, batch_size: int,
spatial_shape: List[int], spatial_shape: List[int],
algo: ConvAlgo, algo: ConvAlgo,
ksize: Union[int, List[int]], ksize: List[int],
stride: Union[int, List[int]], stride: List[int],
padding: Union[int, List[int]], padding: List[int],
dilation: Union[int, List[int]], dilation: List[int],
out_padding: Union[int, List[int]], out_padding: List[int],
subm: bool = False, subm: bool = False,
transpose: bool = False): transpose: bool = False):
# torch.cuda.synchronize()
# t = time.time()
ndim = indices.shape[1] - 1 ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)): kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
kv: int = int(np.prod(ksize))
if not subm: if not subm:
if transpose: if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride, out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
...@@ -87,8 +78,9 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -87,8 +78,9 @@ def get_indice_pairs(indices: torch.Tensor,
padding, dilation) padding, dilation)
else: else:
out_shape = spatial_shape out_shape = spatial_shape
assert algo == ConvAlgo.Native and not transpose, "TODO" assert algo == ConvAlgo.Native, "TODO"
stream = get_current_stream() stream = get_current_stream()
pair = torch.full((2, kv, indices.shape[0]), pair = torch.full((2, kv, indices.shape[0]),
-1, -1,
dtype=indices.dtype, dtype=indices.dtype,
...@@ -96,19 +88,20 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -96,19 +88,20 @@ def get_indice_pairs(indices: torch.Tensor,
indice_num_per_loc = torch.zeros((kv, ), indice_num_per_loc = torch.zeros((kv, ),
dtype=indices.dtype, dtype=indices.dtype,
device=indices.device) device=indices.device)
inds_tv = torch_tensor_to_tv(indices) inds_tv = torch_tensor_to_tv(indices)
pair_tv = torch_tensor_to_tv(pair) pair_tv = torch_tensor_to_tv(pair)
indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc) indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
# torch.cuda.synchronize()
# t = time.time()
if subm: if subm:
out_inds = indices out_inds = indices
hashdata = torch.empty((out_inds.shape[0] * 2, ), hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64, dtype=torch.int64,
device=indices.device) device=indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds) out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64) hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
SpconvOps.generate_subm_conv_inds(inds_tv, SpconvOps.generate_subm_conv_inds(inds_tv,
hashdata_tv, hashdata_tv,
pair_tv, pair_tv,
...@@ -120,27 +113,36 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -120,27 +113,36 @@ def get_indice_pairs(indices: torch.Tensor,
dilation=dilation, dilation=dilation,
stream_int=stream) stream_int=stream)
# torch.cuda.synchronize() # torch.cuda.synchronize()
# print("SUBM", time.time() - t)
# print("SUBM INDICE GEN", time.time() - t)
else: else:
indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ), indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
dtype=indices.dtype, dtype=indices.dtype,
device=indices.device) device=indices.device)
indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq) indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
num_act_out = SpconvOps.generate_conv_inds_stage1(
inds_tv, SpconvOps.generate_conv_inds_stage1(inds_tv,
pair_tv, pair_tv,
indice_pairs_uniq_tv, indice_pairs_uniq_tv,
indice_num_per_loc_tv, indice_num_per_loc_tv,
batch_size=batch_size, batch_size=batch_size,
output_dims=out_shape, output_dims=out_shape,
input_dims=spatial_shape, input_dims=spatial_shape,
ksize=ksize, ksize=ksize,
stride=stride, stride=stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
stream_int=stream) transposed=transpose,
stream_int=stream)
uniq_res = indice_pairs_uniq.unique()
num_act_out = uniq_res.shape[0] - 1
uniq_res_tv = torch_tensor_to_tv(uniq_res)
# num_act_out = SpconvOps.generate_conv_inds_stage1_5(
# indice_pairs_uniq_tv,
# ndim,
# uniq_size=indice_pairs_uniq_tv.size,
# stream_int=stream)
# uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
out_inds = torch.empty((num_act_out, indices.shape[1]), out_inds = torch.empty((num_act_out, indices.shape[1]),
dtype=indices.dtype, dtype=indices.dtype,
device=indices.device) device=indices.device)
...@@ -152,7 +154,7 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -152,7 +154,7 @@ def get_indice_pairs(indices: torch.Tensor,
SpconvOps.generate_conv_inds_stage2(inds_tv, SpconvOps.generate_conv_inds_stage2(inds_tv,
hashdata_tv, hashdata_tv,
pair_tv, pair_tv,
indice_pairs_uniq_tv, uniq_res_tv,
out_inds_tv, out_inds_tv,
num_out_act=num_act_out, num_out_act=num_act_out,
batch_size=batch_size, batch_size=batch_size,
...@@ -162,11 +164,10 @@ def get_indice_pairs(indices: torch.Tensor, ...@@ -162,11 +164,10 @@ def get_indice_pairs(indices: torch.Tensor,
stride=stride, stride=stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
transposed=transpose,
stream_int=stream) stream_int=stream)
# torch.cuda.synchronize() # torch.cuda.synchronize()
# print("REGU", time.time() - t)
# print("INDICE GEN", time.time() - t)
return out_inds, pair, indice_num_per_loc return out_inds, pair, indice_num_per_loc
...@@ -228,8 +229,6 @@ def indice_conv(features: torch.Tensor, ...@@ -228,8 +229,6 @@ def indice_conv(features: torch.Tensor,
c_inds_shape=[nhot_profile], c_inds_shape=[nhot_profile],
hint=AlgoHint.Fowrard.value) hint=AlgoHint.Fowrard.value)
gather_data_tv = tv.Tensor()
scatter_data_tv = tv.Tensor()
maxnhot = max(indice_pair_num_cpu) maxnhot = max(indice_pair_num_cpu)
if profile_res is None: if profile_res is None:
...@@ -270,36 +269,33 @@ def indice_conv(features: torch.Tensor, ...@@ -270,36 +269,33 @@ def indice_conv(features: torch.Tensor,
continue continue
inp_indices = pair_in[i].slice_first_axis(0, nhot) inp_indices = pair_in[i].slice_first_axis(0, nhot)
out_indices = pair_out[i].slice_first_axis(0, nhot) out_indices = pair_out[i].slice_first_axis(0, nhot)
# inp_indices = torch_tensor_to_tv(inp_indices_th)
# out_indices = torch_tensor_to_tv(out_indices_th)
b = filters_tv[i] b = filters_tv[i]
# inp @ filter.T, NC @ KC # inp @ filter.T, NC @ KC
beta = 1.0 if inited else 0.0 beta = 1.0 if inited else 0.0
algo_desp = GEMM.run_profile( algo_desp = GEMM.run_profile(profile_res,
profile_res, a,
a, b,
b, c,
c, False,
False, False if FILTER_HWIO else True,
False if FILTER_HWIO else True, False,
False, arch=arch,
arch=arch, stream=stream,
stream=stream, shuffle_type=ShuffleStrideType.ShuffleAC,
shuffle_type=ShuffleStrideType.ShuffleAC, a_inds=inp_indices,
a_inds=inp_indices, c_inds=out_indices,
c_inds=out_indices, hint=AlgoHint.Fowrard.value,
hint=AlgoHint.Fowrard.value, alpha=1.0,
alpha=1.0, beta=beta)
beta=beta)
# gather_times += gather_time # gather_times += gather_time
inited = True inited = True
# torch.cuda.synchronize() # torch.cuda.synchronize()
# print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt) # # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
# print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times) # # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)
# print(indice_pair_num_cpu) # # print(indice_pair_num_cpu)
# print(time.time() - t) # print("G", time.time() - t)
return out_features return out_features
...@@ -316,8 +312,6 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -316,8 +312,6 @@ def indice_conv_backward(features: torch.Tensor,
inverse: bool = False, inverse: bool = False,
subm: bool = False, subm: bool = False,
algo: ConvAlgo = ConvAlgo.Native): algo: ConvAlgo = ConvAlgo.Native):
# workspace = torch.empty((10000), dtype=torch.uint8, device=features.device)
# workspace_tv = torch_tensor_to_tv(workspace)
# torch.cuda.synchronize() # torch.cuda.synchronize()
# t = time.time() # t = time.time()
...@@ -400,7 +394,6 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -400,7 +394,6 @@ def indice_conv_backward(features: torch.Tensor,
c_inds=out_indices, c_inds=out_indices,
alpha=1.0, alpha=1.0,
beta=0.0, beta=0.0,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint=AlgoHint.BackwardInput.value, hint=AlgoHint.BackwardInput.value,
stream=stream) stream=stream)
if not FILTER_HWIO: if not FILTER_HWIO:
...@@ -445,7 +438,6 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -445,7 +438,6 @@ def indice_conv_backward(features: torch.Tensor,
b_inds=b_inds_wgrad, b_inds=b_inds_wgrad,
alpha=1.0, alpha=1.0,
beta=0.0, beta=0.0,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint=AlgoHint.BackwardWeight.value, hint=AlgoHint.BackwardWeight.value,
stream=stream) stream=stream)
# print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time) # print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
...@@ -457,21 +449,25 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -457,21 +449,25 @@ def indice_conv_backward(features: torch.Tensor,
else: else:
b_shape = [maxnhot, out_bp_tv.dim(1)] b_shape = [maxnhot, out_bp_tv.dim(1)]
a_shape = [maxnhot, features_tv.dim(1)] a_shape = [maxnhot, features_tv.dim(1)]
m, n, k = GEMM.extract_mnk( m, n, k = GEMM.extract_mnk(a_shape,
a_shape, b_shape, profile_res_wgrad.algo_desp.trans_a, b_shape,
profile_res_wgrad.algo_desp.trans_b, profile_res_wgrad.algo_desp.trans_a,
profile_res_wgrad.algo_desp.trans_c, profile_res_wgrad.algo_desp.trans_b,
arch=arch, profile_res_wgrad.algo_desp.trans_c,
shuffle_type=ShuffleStrideType.ShuffleAB, arch=arch,
a_inds_shape=[maxnhot], shuffle_type=ShuffleStrideType.ShuffleAB,
b_inds_shape=[maxnhot], a_inds_shape=[maxnhot],
hint=AlgoHint.BackwardWeight.value) b_inds_shape=[maxnhot],
workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(m, n, k, profile_res_wgrad.splitk) hint=AlgoHint.BackwardWeight.value)
workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
m, n, k, profile_res_wgrad.splitk)
workspace = torch.Tensor() workspace = torch.Tensor()
workspace_tv = tv.Tensor() workspace_tv = tv.Tensor()
if workspace_size > 0: if workspace_size > 0:
workspace = torch.empty((workspace_size,), dtype=torch.int8, device=features.device) workspace = torch.empty((workspace_size, ),
dtype=torch.int8,
device=features.device)
workspace_tv = torch_tensor_to_tv(workspace) workspace_tv = torch_tensor_to_tv(workspace)
# print(workspace_size, m, n, k, profile_res_wgrad.splitk) # print(workspace_size, m, n, k, profile_res_wgrad.splitk)
# torch.cuda.synchronize() # torch.cuda.synchronize()
...@@ -538,11 +534,13 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -538,11 +534,13 @@ def indice_conv_backward(features: torch.Tensor,
# dw_time = time.time() - t # dw_time = time.time() - t
# # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape) # # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
# # print(dw_time + di_time) # # print(dw_time + di_time)
# print(time.time() - t) # print("BWG", time.time() - t)
return (din, dfilters.reshape(filters_shape)) return (din, dfilters.reshape(filters_shape))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out): def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
# torch.cuda.synchronize()
# t = time.time()
out_channel = features.shape[-1] out_channel = features.shape[-1]
out_features = torch.zeros((num_activate_out, out_channel), out_features = torch.zeros((num_activate_out, out_channel),
dtype=features.dtype, dtype=features.dtype,
...@@ -558,6 +556,9 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out): ...@@ -558,6 +556,9 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot]) out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices, SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
inp_indices, stream) inp_indices, stream)
# torch.cuda.synchronize()
# print("M", time.time() - t)
return out_features return out_features
......
...@@ -142,6 +142,20 @@ class SparseMaxPool(SparseModule): ...@@ -142,6 +142,20 @@ class SparseMaxPool(SparseModule):
return out_tensor return out_tensor
class SparseMaxPool1d(SparseMaxPool):
    """Sparse max pooling with the dimension argument fixed to 1.

    Forwards all constructor arguments to ``SparseMaxPool`` unchanged.
    """

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        # The leading 1 is the dimension count expected by the base class.
        pool_args = (1, kernel_size, stride, padding, dilation)
        super().__init__(*pool_args, name=name)
class SparseMaxPool2d(SparseMaxPool): class SparseMaxPool2d(SparseMaxPool):
def __init__(self, def __init__(self,
kernel_size, kernel_size,
...@@ -170,3 +184,17 @@ class SparseMaxPool3d(SparseMaxPool): ...@@ -170,3 +184,17 @@ class SparseMaxPool3d(SparseMaxPool):
padding, padding,
dilation, dilation,
name=name) name=name)
class SparseMaxPool4d(SparseMaxPool):
    """Sparse max pooling with the dimension argument fixed to 4.

    Forwards all constructor arguments to ``SparseMaxPool`` unchanged.
    """

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        # The leading 4 is the dimension count expected by the base class.
        pool_args = (4, kernel_size, stride, padding, dilation)
        super().__init__(*pool_args, name=name)
...@@ -13,20 +13,20 @@ ...@@ -13,20 +13,20 @@
# limitations under the License. # limitations under the License.
STR = """ STR = """
0.0016176700592041016 BWG 0.0008761882781982422
0.002481698989868164 BWG 0.0008311271667480469
0.0027854442596435547 BWG 0.002079486846923828
0.0031723976135253906 BWG 0.002329587936401367
0.0017437934875488281 BWG 0.0025458335876464844
0.0020503997802734375 BWG 0.0026700496673583984
0.001399993896484375 BWG 0.002583742141723633
0.0016183853149414062 BWG 0.0025262832641601562
0.0007357597351074219 BWG 0.003481149673461914
0.0008492469787597656 BWG 0.003238201141357422
0.0006558895111083984 BWG 0.005095958709716797
0.0007994174957275391 BWG 0.0037899017333984375
0.000335693359375 BWG 0.003931283950805664
0.000347137451171875 BWG 0.003300189971923828
""" """
""" """
0.003921985626220703 0.003921985626220703
...@@ -46,37 +46,86 @@ STR = """ ...@@ -46,37 +46,86 @@ STR = """
0.00030994415283203125 0.00030994415283203125
""" """
STR = """ STR1 = """
0.0006084442138671875 SUBM 0.00036716461181640625
0.0005354881286621094 G 0.0010955333709716797
0.0012688636779785156 G 0.0010745525360107422
0.0012619495391845703 REGU 0.0006923675537109375
0.002301931381225586 M 0.0005242824554443359
0.0019693374633789062 SUBM 0.0003108978271484375
0.0038712024688720703 G 0.0010905265808105469
0.002872467041015625 G 0.0011067390441894531
0.005068302154541016 REGU 0.00058746337890625
0.0047588348388671875 M 0.0005304813385009766
0.007832765579223633 SUBM 0.0002682209014892578
0.005643367767333984 G 0.0010945796966552734
0.005807161331176758 G 0.0011165142059326172
0.004715442657470703""" REGU 0.0005419254302978516
M 0.0005164146423339844
SUBM 0.00021505355834960938
G 0.0010805130004882812
G 0.0010516643524169922
REGU 0.00052642822265625
M 0.0004677772521972656
SUBM 0.0002262592315673828
G 0.0010986328125
G 0.0010256767272949219
REGU 0.0005693435668945312
M 0.00048661231994628906
SUBM 0.0002319812774658203
G 0.0011110305786132812
G 0.0011196136474609375
REGU 0.0005295276641845703
M 0.0005729198455810547
SUBM 0.00023889541625976562
G 0.0005326271057128906
G 0.0005140304565429688
""" """
0.0004992485046386719 STR2 = """
0.0003979206085205078 SUBM 0.0003352165222167969
0.0013720989227294922 G 0.001149892807006836
0.0015933513641357422 G 0.0017066001892089844
0.0027768611907958984 REGU 0.0006349086761474609
0.0024590492248535156 M 0.00048804283142089844
0.004837512969970703 SUBM 0.00029850006103515625
0.004601001739501953 G 0.001767873764038086
0.009881019592285156 G 0.0020656585693359375
0.008889913558959961 REGU 0.0005462169647216797
0.017162084579467773 M 0.0005753040313720703
0.009079217910766602 SUBM 0.0002789497375488281
0.009355545043945312 G 0.0012230873107910156
0.0068836212158203125 G 0.0014438629150390625
REGU 0.0005102157592773438
M 0.0005676746368408203
SUBM 0.00020241737365722656
G 0.00102996826171875
G 0.0011174678802490234
REGU 0.0005424022674560547
M 0.0005102157592773438
SUBM 0.0001976490020751953
G 0.0010385513305664062
G 0.0010204315185546875
REGU 0.0005321502685546875
M 0.00047278404235839844
SUBM 0.00021529197692871094
G 0.0010280609130859375
G 0.0010151863098144531
REGU 0.0004942417144775391
M 0.0004811286926269531
SUBM 0.00020694732666015625
G 0.0005142688751220703
G 0.0005171298980712891
""" """
def _handle_lines(s: str):
arr = s.split(" ")
return (arr[0], float(arr[-1]))
from cumm.gemm.codeops import group_by
def print_str(s: str):
    """Print the summed time per label for a multi-line timing log.

    Each line of ``s`` is parsed with ``_handle_lines``, the resulting
    ``(label, seconds)`` records are grouped by label via ``group_by``, and
    a dict mapping each label to the sum of its seconds is printed.
    """
    records = [_handle_lines(line) for line in s.strip().split("\n")]
    grouped = group_by(lambda rec: rec[0], records)
    totals = {}
    for label, recs in grouped.items():
        totals[label] = sum([rec[1] for rec in recs])
    print(totals)
nums = list(map(float, STR.strip().split("\n"))) print_str(STR1)
print(sum(nums)) print_str(STR2)
\ No newline at end of file \ No newline at end of file
...@@ -224,25 +224,26 @@ def main(): ...@@ -224,25 +224,26 @@ def main():
# voxels, coors, spatial_shape = waymo_data() # voxels, coors, spatial_shape = waymo_data()
# with open("/home/yy/test_spconv.pkl", "wb") as f: # with open("/home/yy/test_spconv.pkl", "wb") as f:
# pickle.dump((voxels, coors, spatial_shape), f) # pickle.dump((voxels, coors, spatial_shape), f)
with open("/home/yy/test_spconv.pkl", "rb") as f: with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
(voxels, coors, spatial_shape) = pickle.load(f) (voxels, coors, spatial_shape) = pickle.load(f)
print(spatial_shape) print(spatial_shape)
print(voxels.shape) print(voxels.shape)
# voxels = voxels[:100] # voxels = voxels[:100]
# coors = coors[:100] # coors = coors[:100]
voxels_th = torch.from_numpy(voxels).cuda().float() dtype = torch.float32
voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
coors_th = torch.from_numpy(coors).cuda().int() coors_th = torch.from_numpy(coors).cuda().int()
voxels_th.requires_grad = True voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native algo = spconv.ConvAlgo.Native
net = Net(spatial_shape, algo).cuda().eval().float() net = Net(spatial_shape, algo).cuda().eval().to(dtype)
print(coors_th.shape) print(coors_th.shape)
out = net(voxels_th, coors_th, 1) out = net(voxels_th, coors_th, 1)
print(out.spatial_shape) print(out.spatial_shape)
print(voxels.mean(), voxels.max(), voxels.min()) print(voxels.mean(), voxels.max(), voxels.min())
dout = np.random.uniform(-0.2, 0.2, dout = np.random.uniform(-0.2, 0.2,
out.features.shape).astype(np.float32) out.features.shape).astype(np.float32)
dout_t = torch.from_numpy(dout).cuda() dout_t = torch.from_numpy(dout).cuda().to(dtype)
print(out.spatial_shape, out.features.mean(), out.features.max(), out.features.min()) print(out.spatial_shape, out.features.mean(), out.features.max(), out.features.min())
times = [] times = []
......
...@@ -381,17 +381,17 @@ class TestSpConv(TestCase): ...@@ -381,17 +381,17 @@ class TestSpConv(TestCase):
else: else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC, filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32) IC]).astype(np.float32)
dtype = torch.float16
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device).to(dtype)
features_t.requires_grad = True features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device) features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
features_dense_t.requires_grad = True features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device).to(dtype)
if FILTER_HWIO: if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous() 2).contiguous()
...@@ -442,6 +442,11 @@ class TestSpConv(TestCase): ...@@ -442,6 +442,11 @@ class TestSpConv(TestCase):
strides = [2, 3] strides = [2, 3]
paddings = [0, 1, 2] paddings = [0, 1, 2]
dilations = [1, 2, 3] dilations = [1, 2, 3]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -458,8 +463,13 @@ class TestSpConv(TestCase): ...@@ -458,8 +463,13 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, if FILTER_HWIO:
OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -470,11 +480,20 @@ class TestSpConv(TestCase): ...@@ -470,11 +480,20 @@ class TestSpConv(TestCase):
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, print(net_ref.net[0].weight.shape)
2).contiguous() if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2, dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype) out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
...@@ -490,12 +509,12 @@ class TestSpConv(TestCase): ...@@ -490,12 +509,12 @@ class TestSpConv(TestCase):
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(4, 3, 0, 1, 2) if FILTER_HWIO:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self): def testSpCpConv3d(self):
np.random.seed(484) np.random.seed(484)
......
...@@ -27,8 +27,8 @@ function repair_wheel { ...@@ -27,8 +27,8 @@ function repair_wheel {
export SPCONV_DISABLE_JIT="1" export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="all" export CUMM_CUDA_ARCH_LIST="all"
# Compile wheels, we only support 3.7-3.10. # Compile wheels, we only support 3.6-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp "/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp "/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp "/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment