Commit 8cbb7d3c authored by yanyan's avatar yanyan
Browse files

1.2 release

parent 105b3892
# Changelog
## [1.2.0] - 2020-05-28
### Added
- add batch gemm support. small performance increasement but more gpu memory usage. you can use algo=spconv.ConvAlgo.Batch to use it.
### Changed
- replace most of 'functor' with c++14 dispatch in c++ code.
### Fixed
- change gather/scatterAdd kernel parameter to support large points.
......@@ -11,6 +11,7 @@ endif()
if(WIN32) # true if windows (32 and 64 bit)
add_compile_definitions(TV_WINDOWS)
endif()
add_compile_definitions(PYTORCH_VERSION=${PYTORCH_VERSION})
set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
......
......@@ -15,7 +15,15 @@
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
......@@ -78,21 +86,21 @@ template <typename T, typename Index, int NumTLP, int NumILP,
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
features += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
features += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
reinterpret_cast<VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(
features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
threadIdx.x];
features)[indices[ix + ILPStrideX[ilp]] * numPlanes +
threadIdx.y];
}
}
}
......@@ -124,22 +132,33 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
if (ix + ILPStrideX[ilp] < size) {
if (inds[ilp] != -1) {
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] * numPlanes + iy];
} else {
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = T(0);
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void batchGatherVecKernel(T *buffer, const T *features,
const Index *indices, int size,
int feature_offset,
int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
__global__ void
batchGatherVecKernel(T *buffer, const T *features, const Index *indices,
int size, int feature_offset, int numPlanes,
int indice_batch_stride, int feature_batch_stride) {
int ILPStrideX[NumILP];
Index inds[NumILP];
Index zero[sizeof(VecType) / sizeof(T)];
#pragma unroll
for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
zero[i] = T(0);
}
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
......@@ -158,11 +177,19 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
if (ix + ILPStrideX[ilp] < size) {
if (inds[ilp] != -1) {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(
features)[inds[ilp] * numPlanes + iy];
} else {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(&zero)[0];
}
}
}
}
}
......@@ -174,29 +201,38 @@ __global__ void
batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
int size, int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
int ILPStrideY[NumILP];
int ILPStrideX[NumILP];
Index inds;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
features += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
features += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index inds_elem;
Index zero[sizeof(VecType) / sizeof(T)];
#pragma unroll
for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
zero[i] = T(0);
}
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = iy + ILPStrideY[ilp];
inds_elem = ix + ILPStrideX[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
reinterpret_cast<VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(
features)[inds * numPlanes + threadIdx.x];
features)[inds * numPlanes + threadIdx.y];
} else {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(&zero)[0];
}
}
}
......@@ -234,24 +270,24 @@ template <typename T, typename Index, int NumTLP, int NumILP,
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
int ILPStrideX[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
outFeatures += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
T buf[vecloadFactor];
T buf2[vecloadFactor];
Index idx;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
reinterpret_cast<VecType *>(buf)[0] =
reinterpret_cast<VecType *>(outFeatures)[idx];
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
......@@ -268,6 +304,10 @@ __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
int feature_offset, int numPlanes,
int indice_batch_stride,
int feature_batch_stride) {
// batch scatter add is greatly slower than native scatter when the number of
// points is large. this may due to atomicAdd?
// batch scatter add is greatly faster than native when the number of points
// is small.
int ILPStrideX[NumILP];
Index inds[NumILP];
Index inds_elem;
......@@ -288,7 +328,7 @@ __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
gpuAtomicAdd(outFeatures + inds[ilp] * numPlanes + iy,
TH_ATOMIC_ADD(outFeatures + inds[ilp] * numPlanes + iy,
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
}
}
......@@ -301,22 +341,22 @@ __global__ void
batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, int numPlanes,
int indice_batch_stride, int feature_batch_stride) {
int ILPStrideY[NumILP];
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
outFeatures += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index inds, inds_elem;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = iy + ILPStrideY[ilp];
inds_elem = ix + ILPStrideX[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
gpuAtomicAdd(outFeatures + inds * numPlanes + threadIdx.x,
buffer[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]);
TH_ATOMIC_ADD(outFeatures + inds * numPlanes + threadIdx.y,
buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
}
}
}
......@@ -324,4 +364,6 @@ batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
} // namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
......@@ -21,10 +21,10 @@ namespace spconv {
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void batch_sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size);
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
......
......@@ -23,10 +23,7 @@
namespace spconv {
enum ConvAlgo {
kNative = 0,
kBatchGemm = 1
};
enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32.
template <unsigned NDim>
......@@ -345,8 +342,10 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
}
}
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM);
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM,
bool batchScatter);
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
......@@ -355,13 +354,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM, int64_t algo);
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo);
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
int64_t _subM, bool batchScatter);
} // namespace spconv
#endif
\ No newline at end of file
......@@ -80,6 +80,8 @@ public:
}
} else {
#ifdef TV_CUDA
// we should select device in external
/*
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
......@@ -87,6 +89,7 @@ public:
" but you only have ", deviceCount, " device.");
}
cudaSetDevice(device);
*/
if (managed) {
checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
} else {
......
......@@ -125,6 +125,21 @@ TensorView<T, Rank, PtrTraits, Tindex> torch2tv(const torch::Tensor &tensor) {
return tv::TensorView<T, Rank, PtrTraits, Tindex>(
tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
template <typename T>
torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) {
// only torch >= 1.5 have tensor slice.
torch::Tensor res;
auto tensor_shape = tensor.sizes();
std::vector<int64_t> shape(tensor_shape.begin(), tensor_shape.end());
shape[0] = end - start;
auto dtype = tensor.scalar_type();
uint8_t *ptr = reinterpret_cast<uint8_t *>(tensor.data_ptr());
res = torch::from_blob(ptr + start * tensor.stride(0) * tensor.itemsize(),
torch::IntArrayRef(shape), tensor.options());
return res;
}
namespace detail {
template <> struct TypeToString<at::Half> {
static constexpr const char *value = "half";
......
......@@ -18,6 +18,8 @@ LIBTORCH_ROOT = str(Path(torch.__file__).parent)
SPCONV_FORCE_BUILD_CUDA = os.getenv("SPCONV_FORCE_BUILD_CUDA")
PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
PYTORCH_VERSION = list(map(int, torch.__version__.split(".")))
PYTORCH_VERSION_NUMBER = PYTORCH_VERSION[0] * 10000 + PYTORCH_VERSION[1] * 100 + PYTORCH_VERSION[2]
class CMakeExtension(Extension):
def __init__(self, name, sourcedir='', library_dirs=[]):
......@@ -47,6 +49,7 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
'-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
'-DSPCONV_BuildTests=OFF',
'-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER)
] # -arch=sm_61
if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
cmake_args += ['-DSPCONV_BuildCUDA=OFF']
......
......@@ -19,12 +19,12 @@ import numpy as np
import torch
from spconv import ops, utils
from spconv.ops import ConvAlgo
from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from spconv.identity import Identity
from spconv.modules import SparseModule, SparseSequential
from spconv.ops import ConvAlgo
from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
from spconv.tables import AddTable, ConcatTable, JoinTable
......@@ -62,7 +62,7 @@ class SparseConvTensor(object):
self.features = features
self.indices = indices
if self.indices.dtype != torch.int32:
self.indices.int()
self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
......@@ -82,7 +82,8 @@ class SparseConvTensor(object):
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long().to(self.features.device), self.features, output_shape)
res = scatter_nd(self.indices.long().to(self.features.device),
self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
......
......@@ -25,16 +25,25 @@ class SparseConvFunction(Function):
num_activate_out, algo):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, algo=algo)
return ops.indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
False,
algo=algo)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, algo=ctx.algo)
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
False,
algo=ctx.algo)
return input_bp, filters_bp, None, None, None, None
......@@ -45,15 +54,26 @@ class SparseInverseConvFunction(Function):
num_activate_out, algo):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False, algo=algo)
return ops.indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
True,
False,
algo=algo)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False, algo=ctx.algo)
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
True,
False,
algo=ctx.algo)
return input_bp, filters_bp, None, None, None, None
......@@ -64,15 +84,26 @@ class SubMConvFunction(Function):
num_activate_out, algo):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True, algo=algo)
return ops.indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
False,
True,
algo=algo)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True, algo=ctx.algo)
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
False,
True,
algo=ctx.algo)
return input_bp, filters_bp, None, None, None, None
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
from collections import OrderedDict
......
......@@ -12,15 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
import torch
import spconv
from enum import Enum
class ConvAlgo(Enum):
Native = 0
BatchGemm = 1
Native = 0 # small memory cost, faster when number of points is large.
Batch = 1 # high memory cost, faster when number of points is small (< 50000)
BatchGemmGather = 2 # high memory cost, faster when number of points medium
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
......@@ -59,7 +62,7 @@ def get_indice_pairs(indices,
subm=False,
transpose=False,
grid=None,
use_hash=True):
use_hash=False):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
......
......@@ -25,7 +25,6 @@
#include <tensorview/torch_utils.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>;
......@@ -59,8 +58,8 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
......@@ -127,8 +126,8 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
......@@ -194,25 +193,25 @@ void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
if (nHotBlock >= NumTLP) {
batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
TV_CHECK_CUDA_ERR_V2("batchGatherVecBlockKernel");
}
if (size - nHotBlock > 0) {
batchGatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
batchGatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>(),
size - nHotBlock, nHotBlock, numPlanes / vecloadFactor,
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes / vecloadFactor,
inds_stride, feature_stride);
TV_CHECK_CUDA_ERR();
TV_CHECK_CUDA_ERR_V2("batchGatherVecKernel");
}
notFound = false;
}
......@@ -270,8 +269,8 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
......@@ -283,8 +282,8 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>(),
size - nHotBlock, nHotBlock, numPlanes, inds_stride,
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
......
......@@ -139,10 +139,12 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
int64_t algo) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemm: {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBatch(features, filters, indicePairs, indiceNum,
numActOut, _inverse, _subM);
numActOut, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
......@@ -152,6 +154,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
default:
TV_THROW_RT_ERR("unknown algo");
}
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
......@@ -170,10 +174,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::empty({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
......@@ -181,12 +186,13 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
......@@ -208,7 +214,10 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
nHot);
......@@ -222,14 +231,17 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return output;
}
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
int64_t _inverse, int64_t _subM,
bool batchScatter) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
......@@ -238,6 +250,7 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
// auto timer = spconv::CudaContextTimer<>();
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairNumVec =
std::vector<int>(indicePairNumCpu.data_ptr<int>(),
......@@ -257,85 +270,98 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
// number of indice in the center of filter is much more than other
// filter location.
// so we first use top2 indice num to do batch conv, then
// do native conv in center.
// do native conv (gemm) in center.
int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
int maxKernelVolumePart = kernelVolume;
std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
if (subM) {
maxKernelVolumePart = std::max(indicePairMaxOffset,
int(kernelVolume - indicePairMaxOffset - 1));
part_ranges = {{0, indicePairMaxOffset},
{indicePairMaxOffset + 1, kernelVolume}};
torch::mm_out(output, features, filters[indicePairMaxOffset]);
if (indicePairTop2Size == 0) {
return output;
}
}
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, bufferSize, numInPlanes}, options);
torch::empty({maxKernelVolumePart, bufferSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({kernelVolume, bufferSize, numOutPlanes}, options);
filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
int64_t size = kernelVolume * bufferSize;
torch::empty({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
for (auto &range : part_ranges) {
int start = range.first;
int end = range.second;
int length = end - start;
int64_t size = length * bufferSize;
auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
auto indicePairs1Part =
tv::torch_slice_first_axis(indicePairs[inverse], start, end);
auto indicePairs2Part =
tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_gather_cuda(inputBuffer, features, indicePairs[inverse], size);
batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
torch::bmm_out(outputBuffer, inputBuffer, filters);
// totalGatherTime += timer.report() / 1000.0;
torch::bmm_out(outputBufferPart, inputBufferPart, filtersPart);
// totalGEMMTime += timer.report() / 1000.0;
if (batchScatter) {
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse],
size);
batch_sparse_scatter_add_cuda(outputBufferPart, output,
indicePairs2Part, size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
if (subM) {
auto remain_size = indicePairMaxSize - indicePairTop2Size;
if (remain_size <= 0) {
return output;
} else {
for (int i = 0; i < length; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
if (nHot <= 0) {
continue;
}
inputBuffer = torch::empty({remain_size, numInPlanes}, options);
outputBuffer = torch::empty({remain_size, numOutPlanes}, options);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
sparse_scatter_add_cpu(outputBufferPart[i], output,
indicePairs2Part[i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
tv::dispatch_torch<int32_t, int64_t>(indice_dtype, [&](auto I) {
using Index = decltype(I);
auto indicePairsRemain = torch::from_blob(
indicePairs[inverse][indicePairMaxOffset].data_ptr<Index>() +
indicePairTop2Size,
{remain_size}, indicePairs.options());
sparse_gather_cuda(inputBuffer, features, indicePairsRemain,
remain_size);
});
sparse_scatter_add_cuda(outputBufferPart[i], output,
indicePairs2Part[i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
torch::mm_out(outputBuffer, inputBuffer, filters[indicePairMaxOffset]);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
tv::dispatch_torch<int32_t, int64_t>(indice_dtype, [&](auto I) {
using Index = decltype(I);
auto indicePairsRemain = torch::from_blob(
indicePairs[!inverse][indicePairMaxOffset].data_ptr<Index>() +
indicePairTop2Size,
{remain_size}, indicePairs.options());
sparse_scatter_add_cuda(outputBuffer, output, indicePairsRemain,
remain_size);
});
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return output;
}
......@@ -346,10 +372,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
int64_t algo) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemm: {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM);
indiceNum, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
......@@ -439,7 +467,7 @@ std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
int64_t _subM, bool batchScatter) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
......@@ -467,22 +495,52 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, bufferSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, bufferSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
int64_t size = kernelVolume * bufferSize;
std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
int maxKernelVolumePart = kernelVolume;
if (subM) {
maxKernelVolumePart = std::max(indicePairMaxOffset,
int(kernelVolume - indicePairMaxOffset - 1));
part_ranges = {{0, indicePairMaxOffset},
{indicePairMaxOffset + 1, kernelVolume}};
auto filtersGradSub = filtersGrad[indicePairMaxOffset];
auto filtersSub = filters[indicePairMaxOffset];
torch::mm_out(filtersGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filtersSub.t());
if (indicePairTop2Size == 0) {
return {inputGrad, filtersGrad.view(filterShape)};
}
}
torch::Tensor inputBuffer =
torch::zeros({maxKernelVolumePart, bufferSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
for (auto &range : part_ranges) {
int start = range.first;
int end = range.second;
int length = end - start;
int64_t size = length * bufferSize;
auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
auto indicePairs1Part =
tv::torch_slice_first_axis(indicePairs[inverse], start, end);
auto indicePairs2Part =
tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
auto filtersGradPart = tv::torch_slice_first_axis(filtersGrad, start, end);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_gather_cuda(inputBuffer, features, indicePairs[inverse], size);
batch_sparse_gather_cuda(outputBuffer, outGrad, indicePairs[!inverse],
batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
size);
batch_sparse_gather_cuda(outputBufferPart, outGrad, indicePairs2Part,
size);
}
#endif
......@@ -491,77 +549,45 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
}
// filters: KV, I, O, inputBuffer: [KV, buffer, I]
// outputBuffer: [KV, buffer, O]
torch::bmm_out(filtersGrad, inputBuffer.permute({0, 2, 1}), outputBuffer);
torch::bmm_out(inputBuffer, outputBuffer, filters.permute({0, 2, 1}));
torch::bmm_out(filtersGradPart, inputBufferPart.permute({0, 2, 1}),
outputBufferPart);
torch::bmm_out(inputBuffer, outputBufferPart,
filtersPart.permute({0, 2, 1}));
if (batchScatter) {
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[inverse],
size);
batch_sparse_scatter_add_cuda(inputBufferPart, inputGrad,
indicePairs1Part, size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
if (subM) {
auto remain_size = indicePairMaxSize - indicePairTop2Size;
if (remain_size <= 0) {
return {inputGrad, filtersGrad.view(filterShape)};
} else {
for (int i = 0; i < length; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
if (nHot <= 0) {
continue;
}
inputBuffer = torch::zeros({remain_size, numInPlanes}, options);
outputBuffer = torch::zeros({remain_size, numOutPlanes}, options);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
sparse_scatter_add_cpu(inputBufferPart[i], inputGrad,
indicePairs1Part[i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
tv::dispatch_torch<int32_t, int64_t>(indice_dtype, [&](auto I) {
using Index = decltype(I);
auto indicePairsRemain = torch::from_blob(
indicePairs[inverse][indicePairMaxOffset].data_ptr<Index>() +
indicePairTop2Size,
{remain_size}, indicePairs.options());
auto indicePairsRemain2 = torch::from_blob(
indicePairs[!inverse][indicePairMaxOffset].data_ptr<Index>() +
indicePairTop2Size,
{remain_size}, indicePairs.options());
batch_sparse_gather_cuda(inputBuffer, features, indicePairsRemain,
remain_size);
batch_sparse_gather_cuda(outputBuffer, outGrad, indicePairsRemain2,
remain_size);
});
sparse_scatter_add_cuda(inputBufferPart[i], inputGrad,
indicePairs1Part[i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
torch::mm_out(filtersGrad, inputBuffer.t(), outputBuffer);
torch::mm_out(inputBuffer, outputBuffer, filters[indicePairMaxOffset].t());
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
tv::dispatch_torch<int32_t, int64_t>(indice_dtype, [&](auto I) {
using Index = decltype(I);
auto indicePairsRemain2 = torch::from_blob(
indicePairs[!inverse][indicePairMaxOffset].data_ptr<Index>() +
indicePairTop2Size,
{remain_size}, indicePairs.options());
batch_sparse_scatter_add_cuda(inputBuffer, inputGrad,
indicePairsRemain2, remain_size);
});
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
......
......@@ -27,11 +27,18 @@ from spconv.test_utils import TestCase, generate_sparse_data, params_grid
class SparseConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation):
def __init__(self,
num_layers,
ndim,
shape,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
algo=spconv.ConvAlgo.BatchGemmGather):
super().__init__()
algo = spconv.ConvAlgo.BatchGemm
layers = [
spconv.SparseConv3d(in_channels,
out_channels,
......@@ -67,8 +74,17 @@ class SparseConv3dTestTorch(nn.Module):
class SubMConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation, algo=spconv.ConvAlgo.Native):
def __init__(self,
num_layers,
ndim,
shape,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
algo=spconv.ConvAlgo.Native):
super().__init__()
layers = [
spconv.SubMConv3d(in_channels,
......@@ -96,7 +112,7 @@ class SubMConv3dTestTorch(nn.Module):
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()# .cpu()
coors = coors.int() # .cpu()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x) # .dense()
......@@ -599,13 +615,13 @@ class TestSpConv(TestCase):
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def main():
def main(algo=spconv.ConvAlgo.Native):
# function for develop.
np.random.seed(484)
# devices = ["cuda:0"]
devices = ["cuda:0"]
shapes = [[50, 30, 30]]
batchsizes = [2]
shapes = [[400, 400, 15]]
batchsizes = [1]
in_channels = [32]
out_channels = [64]
......@@ -620,7 +636,7 @@ def main():
if all([s > 1, d > 1]):
continue
device = torch.device(dev)
num_points = [500] * bs
num_points = [30000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
......@@ -636,8 +652,8 @@ def main():
features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
algo=algo).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).float()
......@@ -662,7 +678,8 @@ def main():
print(
np.linalg.norm(out.detach().cpu().numpy() -
out_ref.detach().cpu().numpy()))
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(), out_numpy.sum())
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(),
out_numpy.sum())
def main_subm(algo):
......@@ -671,7 +688,7 @@ def main_subm(algo):
torch.manual_seed(50051)
# devices = ["cuda:0"]
devices = ["cuda:0"]
shapes = [[50, 30, 30]]
shapes = [[400, 400, 15]]
batchsizes = [2]
in_channels = [32]
......@@ -686,7 +703,7 @@ def main_subm(algo):
if all([s > 1, d > 1]):
continue
device = torch.device(dev)
num_points = [1000] * bs
num_points = [240000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
......@@ -702,8 +719,8 @@ def main_subm(algo):
features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d, algo=algo).to(device).float()
net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
algo=algo).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).float()
......@@ -712,7 +729,7 @@ def main_subm(algo):
net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t)
times = []
for i in range(100):
for i in range(20):
t = time.time()
out = net(features_t, indices_t, bs)
torch.cuda.synchronize()
......@@ -727,11 +744,13 @@ def main_subm(algo):
print(
np.linalg.norm(out.detach().cpu().numpy() -
out_ref.detach().cpu().numpy()))
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(), out_numpy.sum())
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(),
out_numpy.sum())
return out_numpy
if __name__ == '__main__':
# out_my = main_subm(algo=spconv.ConvAlgo.BatchGemm)
# main_subm(algo=spconv.ConvAlgo.BatchGemmGather)
# out_ref = main_subm(algo=spconv.ConvAlgo.Native)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment