Commit 62c1496f authored by yan.yan's avatar yan.yan
Browse files

bug fix

parent 01ed382c
name: build
on:
push:
branches:
- main
- feature/*
tags:
- '*'
jobs:
build-windows:
runs-on: windows-latest
strategy:
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
cuda-version: ['10.2', '11.1', '11.4']
steps:
- uses: actions/checkout@master
- name: Install CUDA
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
cuda: ${{ matrix.cuda-version }}
if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
shell: powershell
run: .\tools\install_windows_cuda.ps1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- uses: ilammy/msvc-dev-cmd@v1
- name: Install pep build
run: |
python -m pip install build --user
python -m pip install --upgrade pip twine wheel
python -m pip install pytest setuptools
- name: Build a windows binary wheel
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
run: |
$Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
$Env:CUMM_CUDA_ARCH_LIST = "all"
$Env:SPCONV_DISABLE_JIT = "1"
pip install pccm pybind11
python -m build --wheel --outdir dist/ .
shell: powershell
- name: Publish a Python distribution to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
run: |
$Env:TWINE_USERNAME = "__token__"
$Env:TWINE_PASSWORD = "${{ secrets.pypi_password }}"
twine upload dist/*
shell: powershell
build:
needs: build-windows
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ['3.8'] # this version is only used for upload.
cuda-version: ['102', '111', '114']
steps:
- uses: actions/checkout@master
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install pep build
run: |
python -m pip install build --user
python -m pip install --upgrade pip twine wheel
python -m pip install pytest setuptools
- name: Build a cuda wheel
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel
PLAT: manylinux2014_x86_64
if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || env.CUDA_VERSION == '114'
run: |
docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"
- name: Publish a Python distribution to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
with:
user: __token__
password: ${{ secrets.pypi_password }}
\ No newline at end of file
......@@ -34,7 +34,6 @@
* training/inference speed is increased
* support int8/tensor core
* doesn't depend on pytorch binary.
* If your GPU has tensor core, try mixed precision training in spconv 2.x!
* since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.
## TODO in Spconv 2.x
......
[build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.3"]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.5"]
build-backend = "setuptools.build_meta"
......@@ -30,7 +30,7 @@ DESCRIPTION = 'spatial sparse convolution'
URL = 'https://github.com/traveller59/spconv'
EMAIL = 'yanyan.sub@outlook.com'
AUTHOR = 'Yan Yan'
REQUIRES_PYTHON = '>=3.7'
REQUIRES_PYTHON = '>=3.6'
VERSION = None
# What packages are required for this module to be executed?
......@@ -134,12 +134,11 @@ if disable_jit is not None and disable_jit == "1":
}
from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
from spconv.csrc.sparse.all import SpconvOps
from cumm.gemm.gather import GatherAll
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
cu.namespace = "cumm.gemm.main"
ext_modules: List[Extension] = [
PCCMExtension([cu, SpconvOps(), GatherAll()],
PCCMExtension([cu, SpconvOps()],
"spconv/core_cc",
Path(__file__).resolve().parent / "spconv")
]
......
......@@ -22,7 +22,6 @@ from .constants import PACKAGE_NAME, PACKAGE_ROOT
if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
from spconv.csrc.sparse.all import SpconvOps
# from cumm.gemm.gather import GatherAll, ScatterAll
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
cu.namespace = "cumm.gemm.main"
pccm.builder.build_pybind([cu, SpconvOps()],
......
......@@ -3,14 +3,12 @@ from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class SpconvOps:
@staticmethod
def generate_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]) -> int:
def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None:
"""
Args:
indices:
hashdata:
indice_pairs:
indice_pairs_uniq:
out_inds:
indice_num_per_loc:
batch_size:
output_dims:
......@@ -19,28 +17,22 @@ class SpconvOps:
stride:
padding:
dilation:
transposed:
stream_int:
"""
...
@staticmethod
def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int:
def generate_conv_inds_stage1_5(indice_pairs_uniq: Tensor, ndim: int, uniq_size: int, stream_int: int = 0) -> int:
"""
Args:
indices:
indice_pairs:
indice_pairs_uniq:
indice_num_per_loc:
batch_size:
output_dims:
input_dims:
ksize:
stride:
padding:
dilation:
ndim:
uniq_size:
stream_int:
"""
...
@staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int:
def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int:
"""
Args:
indices:
......@@ -56,6 +48,7 @@ class SpconvOps:
stride:
padding:
dilation:
transposed:
stream_int:
"""
...
......
......@@ -35,26 +35,30 @@ class SpconvOps(pccm.Class):
problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
indices = SparseConvIndicesKernel(problem, dtypes.int32)
# self.add_param_class("ops", indices, "SpconvIndices")
cuda_funcs = [self.generate_conv_inds, self.generate_subm_conv_inds,
self.generate_conv_inds_stage1, self.generate_conv_inds_stage2, self.sort_1d_by_key]
cuda_funcs = [self.generate_subm_conv_inds,
self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key]
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
@pccm.pybind.mark
@pccm.cuda.static_function
def generate_conv_inds(self):
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
for ndim in self.ndims:
code.raw(f"""
if (ndim == {ndim}){{
......@@ -68,53 +72,31 @@ class SpconvOps(pccm.Class):
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc,
return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
ksize_, stride_, padding_, dilation_, transposed, stream_int);
}}
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("int")
return code# .ret("int")
@pccm.pybind.mark
@pccm.cuda.static_function
def generate_conv_inds_stage1(self):
def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("indice_pairs_uniq", "tv::Tensor")
code.arg("ndim", "int")
code.arg("uniq_size", "int64_t")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
for ndim in self.ndims:
code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
}}
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
return code.ret("int")
@pccm.pybind.mark
......@@ -127,6 +109,7 @@ class SpconvOps(pccm.Class):
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"std::vector<int>")
code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.raw(f"""
int ndim = indices.dim(1) - 1;
......@@ -151,7 +134,7 @@ class SpconvOps(pccm.Class):
return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
ksize_, stride_, padding_, dilation_, transposed, stream_int);
}}
""")
code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
......
......@@ -225,6 +225,28 @@ class ConvOutLocIter(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.member_function(host=True,
device=True,
forceinline=True,
const=True)
def query_nhw_out(self):
code = pccm.FunctionCode()
code.arg("npq_offset", "const int*")
code.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
code.ret("bool")
code.raw(f"""
nhw_offset = npq_to_nhw(npq_offset);
""")
hw_valid = [] # type: List[str]
for i in range(self.ndim):
hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f"""
return nhw_offset[0] < problem_.N &&
{' && '.join(hw_valid)};
""")
return code
class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__()
......@@ -255,7 +277,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("indices_pair_size", "int")
code.arg("RS", "int")
# code.arg("bool", "transposed")
code.arg("transposed", "bool")
code.raw(f"""
int filter_offset = blockIdx.y;
......@@ -264,7 +286,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {self.ndim + 1}> npq_offset;
if (loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset)){{
bool valid;
if (transposed){{
valid = loc_iter.query_nhw_out(indices_in + i * {self.ndim + 1}, npq_offset);
}}else{{
valid = loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset);
}}
if (valid){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
{self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
if (old_num < indices_pair_size){{
......@@ -514,81 +542,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.static_function
def generate_conv_inds(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.raw(f"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size);
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
tv::ssprint("clean time", timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(0);
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
// return num_out_act;
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear();
tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(),
loc_iter.layout_npq, num_out_act);
tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
tv::ssprint("gem conv inds time", timer.report() / 1000.0);
return num_out_act;
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
......@@ -597,6 +550,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
......@@ -620,7 +575,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
indice_pairs.dim(2), kv, transposed);
// thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
// auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
// thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
""")
return code# .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode()
code.arg("indice_pairs_uniq", "tv::Tensor")
code.arg("uniq_size", "int64_t")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
......@@ -630,6 +601,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage2(self):
code = pccm.FunctionCode()
......@@ -639,6 +611,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
......@@ -651,7 +624,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
......
......@@ -22,7 +22,9 @@ from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, Thru
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
import numpy as np
from cumm.gemm import (thread_map)
class IndiceMaxPool(pccm.Class):
# TODO optimize this function
......@@ -171,4 +173,3 @@ class IndiceMaxPool(pccm.Class):
}});
""")
return code
......@@ -14,6 +14,7 @@
import math
import time
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
......@@ -39,6 +40,10 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
if FILTER_HWIO:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-1)
num_output_fmaps = tensor.size(-2)
......@@ -58,22 +63,22 @@ class SparseConvolution(SparseModule):
]
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False,
algo=ops.ConvAlgo.Native,
ndim: int,
in_channels: int,
out_channels: int,
kernel_size: Union[int, List[int], Tuple[int, ...]]=3,
stride: Union[int, List[int], Tuple[int, ...]]=1,
padding: Union[int, List[int], Tuple[int, ...]]=0,
dilation: Union[int, List[int], Tuple[int, ...]]=1,
groups: Union[int, List[int], Tuple[int, ...]]=1,
bias: bool=True,
subm: bool=False,
output_padding: Union[int, List[int], Tuple[int, ...]]=0,
transposed: bool=False,
inverse: bool=False,
indice_key: Optional[str]=None,
fused_bn: bool=False,
algo: ops.ConvAlgo=ops.ConvAlgo.Native,
name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1
......@@ -117,8 +122,6 @@ class SparseConvolution(SparseModule):
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
# self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
# self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
self.reset_parameters()
def reset_parameters(self):
......@@ -234,6 +237,7 @@ class SparseConvolution(SparseModule):
t = time.time()
if self.fused_bn:
raise NotImplementedError
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
......@@ -382,6 +386,34 @@ class SparseConv4d(SparseConvolution):
name=name)
class SparseConvTranspose1d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose1d, self).__init__(1,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key,
algo=algo,
name=name)
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
......@@ -437,6 +469,34 @@ class SparseConvTranspose3d(SparseConvolution):
algo=algo,
name=name)
class SparseConvTranspose4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose4d, self).__init__(4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key,
algo=algo,
name=name)
class SparseInverseConv1d(SparseConvolution):
def __init__(self,
in_channels,
......
......@@ -143,50 +143,3 @@ class SparseSequential(SparseModule):
input = module(input)
return input
def fused(self):
"""don't use this. no effect.
"""
from spconv.pytorch.conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
while idx < len(mods):
if is_sparse_conv(mods[idx]):
if idx < len(mods) - 1 and isinstance(mods[idx + 1],
nn.BatchNorm1d):
new_module = SparseConvolution(
ndim=mods[idx].ndim,
in_channels=mods[idx].in_channels,
out_channels=mods[idx].out_channels,
kernel_size=mods[idx].kernel_size,
stride=mods[idx].stride,
padding=mods[idx].padding,
dilation=mods[idx].dilation,
groups=mods[idx].groups,
bias=True,
subm=mods[idx].subm,
output_padding=mods[idx].output_padding,
transposed=mods[idx].transposed,
inverse=mods[idx].inverse,
indice_key=mods[idx].indice_key,
fused_bn=True,
)
new_module.load_state_dict(mods[idx].state_dict(), False)
new_module.to(mods[idx].weight.device)
conv = new_module
bn = mods[idx + 1]
conv.bias.data.zero_()
conv.weight.data[:] = conv.weight.data * bn.weight.data / (
torch.sqrt(bn.running_var) + bn.eps)
conv.bias.data[:] = (
conv.bias.data - bn.running_mean) * bn.weight.data / (
torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
fused_mods.append(conv)
idx += 2
else:
fused_mods.append(mods[idx])
idx += 1
else:
fused_mods.append(mods[idx])
idx += 1
return SparseSequential(*fused_mods)
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
from enum import Enum
from cumm import tensorview as tv
from cumm.gemm.algospec.core import ShuffleStrideType
......@@ -23,7 +23,7 @@ from spconv.algo import AlgoHint, ConvAlgo
from typing import List, Union
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps
from spconv.algo import GEMM# , GATHER, SCATTER
from spconv.algo import GEMM # , GATHER, SCATTER
import time
from spconv.constants import FILTER_HWIO
......@@ -58,26 +58,17 @@ def get_indice_pairs(indices: torch.Tensor,
batch_size: int,
spatial_shape: List[int],
algo: ConvAlgo,
ksize: Union[int, List[int]],
stride: Union[int, List[int]],
padding: Union[int, List[int]],
dilation: Union[int, List[int]],
out_padding: Union[int, List[int]],
ksize: List[int],
stride: List[int],
padding: List[int],
dilation: List[int],
out_padding: List[int],
subm: bool = False,
transpose: bool = False):
# torch.cuda.synchronize()
# t = time.time()
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(out_padding, (list, tuple)):
out_padding = [out_padding] * ndim
kv: int = int(np.prod(ksize))
kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
......@@ -87,8 +78,9 @@ def get_indice_pairs(indices: torch.Tensor,
padding, dilation)
else:
out_shape = spatial_shape
assert algo == ConvAlgo.Native and not transpose, "TODO"
assert algo == ConvAlgo.Native, "TODO"
stream = get_current_stream()
pair = torch.full((2, kv, indices.shape[0]),
-1,
dtype=indices.dtype,
......@@ -96,19 +88,20 @@ def get_indice_pairs(indices: torch.Tensor,
indice_num_per_loc = torch.zeros((kv, ),
dtype=indices.dtype,
device=indices.device)
inds_tv = torch_tensor_to_tv(indices)
pair_tv = torch_tensor_to_tv(pair)
indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
# torch.cuda.synchronize()
# t = time.time()
if subm:
out_inds = indices
hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64,
device=indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
SpconvOps.generate_subm_conv_inds(inds_tv,
hashdata_tv,
pair_tv,
......@@ -120,16 +113,15 @@ def get_indice_pairs(indices: torch.Tensor,
dilation=dilation,
stream_int=stream)
# torch.cuda.synchronize()
# print("SUBM INDICE GEN", time.time() - t)
# print("SUBM", time.time() - t)
else:
indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
dtype=indices.dtype,
device=indices.device)
indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
num_act_out = SpconvOps.generate_conv_inds_stage1(
inds_tv,
SpconvOps.generate_conv_inds_stage1(inds_tv,
pair_tv,
indice_pairs_uniq_tv,
indice_num_per_loc_tv,
......@@ -140,7 +132,17 @@ def get_indice_pairs(indices: torch.Tensor,
stride=stride,
padding=padding,
dilation=dilation,
transposed=transpose,
stream_int=stream)
uniq_res = indice_pairs_uniq.unique()
num_act_out = uniq_res.shape[0] - 1
uniq_res_tv = torch_tensor_to_tv(uniq_res)
# num_act_out = SpconvOps.generate_conv_inds_stage1_5(
# indice_pairs_uniq_tv,
# ndim,
# uniq_size=indice_pairs_uniq_tv.size,
# stream_int=stream)
# uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
out_inds = torch.empty((num_act_out, indices.shape[1]),
dtype=indices.dtype,
device=indices.device)
......@@ -152,7 +154,7 @@ def get_indice_pairs(indices: torch.Tensor,
SpconvOps.generate_conv_inds_stage2(inds_tv,
hashdata_tv,
pair_tv,
indice_pairs_uniq_tv,
uniq_res_tv,
out_inds_tv,
num_out_act=num_act_out,
batch_size=batch_size,
......@@ -162,11 +164,10 @@ def get_indice_pairs(indices: torch.Tensor,
stride=stride,
padding=padding,
dilation=dilation,
transposed=transpose,
stream_int=stream)
# torch.cuda.synchronize()
# print("INDICE GEN", time.time() - t)
# print("REGU", time.time() - t)
return out_inds, pair, indice_num_per_loc
......@@ -228,8 +229,6 @@ def indice_conv(features: torch.Tensor,
c_inds_shape=[nhot_profile],
hint=AlgoHint.Fowrard.value)
gather_data_tv = tv.Tensor()
scatter_data_tv = tv.Tensor()
maxnhot = max(indice_pair_num_cpu)
if profile_res is None:
......@@ -270,13 +269,10 @@ def indice_conv(features: torch.Tensor,
continue
inp_indices = pair_in[i].slice_first_axis(0, nhot)
out_indices = pair_out[i].slice_first_axis(0, nhot)
# inp_indices = torch_tensor_to_tv(inp_indices_th)
# out_indices = torch_tensor_to_tv(out_indices_th)
b = filters_tv[i]
# inp @ filter.T, NC @ KC
beta = 1.0 if inited else 0.0
algo_desp = GEMM.run_profile(
profile_res,
algo_desp = GEMM.run_profile(profile_res,
a,
b,
c,
......@@ -295,11 +291,11 @@ def indice_conv(features: torch.Tensor,
# gather_times += gather_time
inited = True
# torch.cuda.synchronize()
# print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
# print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times)
# # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
# # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)
# print(indice_pair_num_cpu)
# print(time.time() - t)
# # print(indice_pair_num_cpu)
# print("G", time.time() - t)
return out_features
......@@ -316,8 +312,6 @@ def indice_conv_backward(features: torch.Tensor,
inverse: bool = False,
subm: bool = False,
algo: ConvAlgo = ConvAlgo.Native):
# workspace = torch.empty((10000), dtype=torch.uint8, device=features.device)
# workspace_tv = torch_tensor_to_tv(workspace)
# torch.cuda.synchronize()
# t = time.time()
......@@ -400,7 +394,6 @@ def indice_conv_backward(features: torch.Tensor,
c_inds=out_indices,
alpha=1.0,
beta=0.0,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint=AlgoHint.BackwardInput.value,
stream=stream)
if not FILTER_HWIO:
......@@ -445,7 +438,6 @@ def indice_conv_backward(features: torch.Tensor,
b_inds=b_inds_wgrad,
alpha=1.0,
beta=0.0,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint=AlgoHint.BackwardWeight.value,
stream=stream)
# print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
......@@ -457,8 +449,9 @@ def indice_conv_backward(features: torch.Tensor,
else:
b_shape = [maxnhot, out_bp_tv.dim(1)]
a_shape = [maxnhot, features_tv.dim(1)]
m, n, k = GEMM.extract_mnk(
a_shape, b_shape, profile_res_wgrad.algo_desp.trans_a,
m, n, k = GEMM.extract_mnk(a_shape,
b_shape,
profile_res_wgrad.algo_desp.trans_a,
profile_res_wgrad.algo_desp.trans_b,
profile_res_wgrad.algo_desp.trans_c,
arch=arch,
......@@ -466,12 +459,15 @@ def indice_conv_backward(features: torch.Tensor,
a_inds_shape=[maxnhot],
b_inds_shape=[maxnhot],
hint=AlgoHint.BackwardWeight.value)
workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(m, n, k, profile_res_wgrad.splitk)
workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
m, n, k, profile_res_wgrad.splitk)
workspace = torch.Tensor()
workspace_tv = tv.Tensor()
if workspace_size > 0:
workspace = torch.empty((workspace_size,), dtype=torch.int8, device=features.device)
workspace = torch.empty((workspace_size, ),
dtype=torch.int8,
device=features.device)
workspace_tv = torch_tensor_to_tv(workspace)
# print(workspace_size, m, n, k, profile_res_wgrad.splitk)
# torch.cuda.synchronize()
......@@ -538,11 +534,13 @@ def indice_conv_backward(features: torch.Tensor,
# dw_time = time.time() - t
# # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
# # print(dw_time + di_time)
# print(time.time() - t)
# print("BWG", time.time() - t)
return (din, dfilters.reshape(filters_shape))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
# torch.cuda.synchronize()
# t = time.time()
out_channel = features.shape[-1]
out_features = torch.zeros((num_activate_out, out_channel),
dtype=features.dtype,
......@@ -558,6 +556,9 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
inp_indices, stream)
# torch.cuda.synchronize()
# print("M", time.time() - t)
return out_features
......
......@@ -142,6 +142,20 @@ class SparseMaxPool(SparseModule):
return out_tensor
class SparseMaxPool1d(SparseMaxPool):
def __init__(self,
kernel_size,
stride=None,
padding=0,
dilation=1,
name=None):
super(SparseMaxPool1d, self).__init__(1,
kernel_size,
stride,
padding,
dilation,
name=name)
class SparseMaxPool2d(SparseMaxPool):
def __init__(self,
kernel_size,
......@@ -170,3 +184,17 @@ class SparseMaxPool3d(SparseMaxPool):
padding,
dilation,
name=name)
class SparseMaxPool4d(SparseMaxPool):
def __init__(self,
kernel_size,
stride=None,
padding=0,
dilation=1,
name=None):
super(SparseMaxPool4d, self).__init__(4,
kernel_size,
stride,
padding,
dilation,
name=name)
......@@ -13,20 +13,20 @@
# limitations under the License.
STR = """
0.0016176700592041016
0.002481698989868164
0.0027854442596435547
0.0031723976135253906
0.0017437934875488281
0.0020503997802734375
0.001399993896484375
0.0016183853149414062
0.0007357597351074219
0.0008492469787597656
0.0006558895111083984
0.0007994174957275391
0.000335693359375
0.000347137451171875
BWG 0.0008761882781982422
BWG 0.0008311271667480469
BWG 0.002079486846923828
BWG 0.002329587936401367
BWG 0.0025458335876464844
BWG 0.0026700496673583984
BWG 0.002583742141723633
BWG 0.0025262832641601562
BWG 0.003481149673461914
BWG 0.003238201141357422
BWG 0.005095958709716797
BWG 0.0037899017333984375
BWG 0.003931283950805664
BWG 0.003300189971923828
"""
"""
0.003921985626220703
......@@ -46,37 +46,86 @@ STR = """
0.00030994415283203125
"""
STR = """
0.0006084442138671875
0.0005354881286621094
0.0012688636779785156
0.0012619495391845703
0.002301931381225586
0.0019693374633789062
0.0038712024688720703
0.002872467041015625
0.005068302154541016
0.0047588348388671875
0.007832765579223633
0.005643367767333984
0.005807161331176758
0.004715442657470703"""
STR1 = """
SUBM 0.00036716461181640625
G 0.0010955333709716797
G 0.0010745525360107422
REGU 0.0006923675537109375
M 0.0005242824554443359
SUBM 0.0003108978271484375
G 0.0010905265808105469
G 0.0011067390441894531
REGU 0.00058746337890625
M 0.0005304813385009766
SUBM 0.0002682209014892578
G 0.0010945796966552734
G 0.0011165142059326172
REGU 0.0005419254302978516
M 0.0005164146423339844
SUBM 0.00021505355834960938
G 0.0010805130004882812
G 0.0010516643524169922
REGU 0.00052642822265625
M 0.0004677772521972656
SUBM 0.0002262592315673828
G 0.0010986328125
G 0.0010256767272949219
REGU 0.0005693435668945312
M 0.00048661231994628906
SUBM 0.0002319812774658203
G 0.0011110305786132812
G 0.0011196136474609375
REGU 0.0005295276641845703
M 0.0005729198455810547
SUBM 0.00023889541625976562
G 0.0005326271057128906
G 0.0005140304565429688
"""
0.0004992485046386719
0.0003979206085205078
0.0013720989227294922
0.0015933513641357422
0.0027768611907958984
0.0024590492248535156
0.004837512969970703
0.004601001739501953
0.009881019592285156
0.008889913558959961
0.017162084579467773
0.009079217910766602
0.009355545043945312
0.0068836212158203125
STR2 = """
SUBM 0.0003352165222167969
G 0.001149892807006836
G 0.0017066001892089844
REGU 0.0006349086761474609
M 0.00048804283142089844
SUBM 0.00029850006103515625
G 0.001767873764038086
G 0.0020656585693359375
REGU 0.0005462169647216797
M 0.0005753040313720703
SUBM 0.0002789497375488281
G 0.0012230873107910156
G 0.0014438629150390625
REGU 0.0005102157592773438
M 0.0005676746368408203
SUBM 0.00020241737365722656
G 0.00102996826171875
G 0.0011174678802490234
REGU 0.0005424022674560547
M 0.0005102157592773438
SUBM 0.0001976490020751953
G 0.0010385513305664062
G 0.0010204315185546875
REGU 0.0005321502685546875
M 0.00047278404235839844
SUBM 0.00021529197692871094
G 0.0010280609130859375
G 0.0010151863098144531
REGU 0.0004942417144775391
M 0.0004811286926269531
SUBM 0.00020694732666015625
G 0.0005142688751220703
G 0.0005171298980712891
"""
def _handle_lines(s: str):
arr = s.split(" ")
return (arr[0], float(arr[-1]))
from cumm.gemm.codeops import group_by
def print_str(s: str):
nums = list(map(_handle_lines, s.strip().split("\n")))
num_dict = group_by(lambda x: x[0], nums)
num_dict_ = {k: sum([vv[1] for vv in v]) for k, v in num_dict.items()}
print(num_dict_)
nums = list(map(float, STR.strip().split("\n")))
print(sum(nums))
\ No newline at end of file
print_str(STR1)
print_str(STR2)
\ No newline at end of file
......@@ -224,25 +224,26 @@ def main():
# voxels, coors, spatial_shape = waymo_data()
# with open("/home/yy/test_spconv.pkl", "wb") as f:
# pickle.dump((voxels, coors, spatial_shape), f)
with open("/home/yy/test_spconv.pkl", "rb") as f:
with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
(voxels, coors, spatial_shape) = pickle.load(f)
print(spatial_shape)
print(voxels.shape)
# voxels = voxels[:100]
# coors = coors[:100]
voxels_th = torch.from_numpy(voxels).cuda().float()
dtype = torch.float32
voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
coors_th = torch.from_numpy(coors).cuda().int()
voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native
net = Net(spatial_shape, algo).cuda().eval().float()
net = Net(spatial_shape, algo).cuda().eval().to(dtype)
print(coors_th.shape)
out = net(voxels_th, coors_th, 1)
print(out.spatial_shape)
print(voxels.mean(), voxels.max(), voxels.min())
dout = np.random.uniform(-0.2, 0.2,
out.features.shape).astype(np.float32)
dout_t = torch.from_numpy(dout).cuda()
dout_t = torch.from_numpy(dout).cuda().to(dtype)
print(out.spatial_shape, out.features.mean(), out.features.max(), out.features.min())
times = []
......
......@@ -381,17 +381,17 @@ class TestSpConv(TestCase):
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
dtype = torch.float16
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t = torch.from_numpy(features).to(device).to(dtype)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
d).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device)
d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device).to(dtype)
if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
......@@ -442,6 +442,11 @@ class TestSpConv(TestCase):
strides = [2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
......@@ -458,8 +463,13 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
if FILTER_HWIO:
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
......@@ -470,11 +480,20 @@ class TestSpConv(TestCase):
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device)
print(net_ref.net[0].weight.shape)
if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
......@@ -490,12 +509,12 @@ class TestSpConv(TestCase):
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
if FILTER_HWIO:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self):
np.random.seed(484)
......
......@@ -27,8 +27,8 @@ function repair_wheel {
export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="all"
# Compile wheels, we only support 3.7-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
# Compile wheels, we only support 3.6-3.10.
"/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment