Commit 9ce18407 authored by yanyan's avatar yanyan
Browse files

add SparseConvNet algorithm

parent 47bea04c
......@@ -37,8 +37,8 @@ observers:
libpaths: [
/home/yy/anaconda3/lib,
]
libraries: [-lnvinfer, -lpython3.7m, -lcublas, -lcudart, -ljpeg]
std: c++2a
libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
std: c++14
options: [-Wall, -Wextra]
cudadev:
......@@ -65,11 +65,11 @@ observers:
std: c++14
options: [
-Wno-deprecated-declarations,
"-gencode=arch=compute_52,code=sm_52",
# "-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_70,code=sm_70",
"-gencode=arch=compute_75,code=sm_75",
# "-gencode=arch=compute_60,code=sm_60",
# "-gencode=arch=compute_70,code=sm_70",
# "-gencode=arch=compute_75,code=sm_75",
]
torchdev:
......
/*
BSD License
For SparseConvNet software
Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define TACC double
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> \
<<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
double *outFeatures, double *w,
int32_t *rulesIn, int32_t *rulesOut,
int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes,
int32_t output_stride, int32_t nGroups) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = output_nPlanes / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = output_nPlanes / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures, \
w, dw, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> \
<<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
rulesOut + o, nHot - o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
int32_t M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
int32_t KO = min(K, output_nPlanes - K * n);
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ int32_t R[K * 2];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
int32_t KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < 1) {
if (s + tx < nHot) {
R[2 * tx] = rulesIn[s + tx];
R[2 * tx + 1] = rulesOut[s + tx];
}
// R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < KI; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW2(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = (output_nPlanes + K - 1) / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
int32_t KI = min(K, input_nPlanes - K * m);
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ int32_t R[K * 2];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
int32_t KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < 1) {
if (s + tx < nHot) {
R[2 * tx] = rulesIn[s + tx];
R[2 * tx + 1] = rulesOut[s + tx];
}
// R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < KO; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
template <typename T>
void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
int32_t c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int32_t K = 16;
const int32_t V = 4;
dConvolution_KMxKN_forward2<T, K, V>
<<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
s>>>(inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
} else {
dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
input_nPlanes, input_stride, output_nPlanes,
output_stride, nGroups);
}
}
template <typename T>
void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
int32_t c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int32_t K = 16;
const int32_t V = 4;
dConvolution_KMxKN_backward_dW2<T, K, V>
<<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
nHot, input_nPlanes, input_stride, output_nPlanes,
output_stride);
} else {
dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
rulesIn, rulesOut, nHot, input_nPlanes,
input_stride, output_nPlanes, output_stride,
nGroups);
}
}
#undef TACC
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
} // namespace spconv
......@@ -20,6 +20,18 @@
#include <tensorview/tensorview.h>
namespace spconv {
template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch;
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<true, Index, NDim>{
constexpr static auto* func = getValidOutPosTranspose<Index, NDim>;
};
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<false, Index, NDim>{
constexpr static auto* func = getValidOutPos<Index, NDim>;
};
template <typename Index, unsigned NDim, bool UseDeconv,
int KernelMaxVolume = 256, typename Index1D = int>
__global__ void prepareIndicePairsKernel(
......@@ -47,19 +59,10 @@ __global__ void prepareIndicePairsKernel(
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
if (UseDeconv) {
// nvcc will optimize this fake "if constexpr"
// after cuda 11 released, we will start to use real if constexpr.
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
} else {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
}
numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
......
......@@ -23,7 +23,9 @@
namespace spconv {
enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet };
using all_conv_algos_t =
tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet>;
// torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor>
......
#pragma once
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace spconv {
......
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
......
/*
From PyTorch:
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
From Caffe2:
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <type_traits>
#include <utility>
namespace tv {
#ifdef __cpp_lib_void_t
template <class T> using void_t = std::void_t<T>;
#else
// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
// (it takes CWG1558 into account and also works for older compilers)
template <typename... Ts> struct make_void { typedef void type; };
template <typename... Ts> using void_t = typename make_void<Ts...>::type;
#endif
namespace detail {
struct _identity final {
template <class T> using type_identity = T;
template <class T> decltype(auto) operator()(T &&arg) {
return std::forward<T>(arg);
}
};
template<class Func, class Enable = void>
struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER)
// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t.
// But we're only building on MSVC versions that have std::void_t, so let's just use that one.
template<class Func>
struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
#else
template<class Func>
struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
#endif
template<bool Condition>
struct _if_constexpr;
template<>
struct _if_constexpr<true> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
return thenCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
return thenCallback();
}
};
template<>
struct _if_constexpr<false> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
return elseCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
return elseCallback();
}
};
} // namespace detail
/*
* Get something like C++17 if constexpr in C++14.
*
* Example 1: simple constexpr if/then/else
* template<int arg> int increment_absolute_value() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* [&] { ++result; } // then-case
* [&] { --result; } // else-case
* );
* return result;
* }
*
* Example 2: without else case (i.e. conditionally prune code from assembly)
* template<int arg> int decrement_if_positive() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* // This decrement operation is only present in the assembly for
* // template instances with arg > 0.
* [&] { --result; }
* );
* return result;
* }
*
* Example 3: branch based on type (i.e. replacement for SFINAE)
* struct MyClass1 {int value;};
* struct MyClass2 {int val};
* template <class T>
* int func(T t) {
* return if_constexpr<std::is_same<T, MyClass1>::value>(
* [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T == MyClass1
* );
* }
*
* Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing.
* It is used to force the compiler to delay type checking, because the compiler
* doesn't know what kind of _ is passed in. Without it, the compiler would fail
* when you try to access t.value but the member doesn't exist.
*
* Note: In Example 3, both branches return int, so func() returns int. This is not necessary.
* If func() had a return type of "auto", then both branches could return different
* types, say func<MyClass1>() could return int and func<MyClass2>() could return string.
*/
template<bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
} else {
if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) {
return std::forward<ElseCallback>(elseCallback)(detail::_identity());
} else {
return std::forward<ElseCallback>(elseCallback)();
}
}
#else
// C++14 implementation of if constexpr
return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback),
std::forward<ElseCallback>(elseCallback));
#endif
}
template<bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
}
#else
// C++14 implementation of if constexpr
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {});
#endif
}
}
......@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <sstream>
#ifdef TV_USE_STACKTRACE
......
......@@ -28,6 +28,7 @@ If you can use libtorch, dont use tv::Tensor.
#include <iomanip>
#include <memory>
#include <type_traits>
#include "cc17.h"
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -623,66 +624,35 @@ struct Tensor {
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
writable_check();
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (size_t i = 0; i < ndim(); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<(Rank > 0), int>::type = 0>
typename Tindex = int>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
return if_constexpr<(Rank > 0)>([&](auto _){
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}, [&](auto _){
ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
});
}
template <class... Inds> Tensor view(Inds... newShapes) const {
static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
TensorShape shape{int(newShapes)...};
......@@ -780,10 +750,10 @@ struct Tensor {
int dim(int idx) const {
if (idx < 0) {
TV_ASSERT_RT_ERR(shape_.size() + idx < shape_.size(), idx, shape_);
return shape_[shape_.size() + idx];
TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
return shape_[shape_.ndim() + idx];
} else {
TV_ASSERT_RT_ERR(idx < int(shape_.size()), idx, shape_);
TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
return shape_[idx];
}
}
......
......@@ -35,6 +35,24 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
mCurTime = std::chrono::steady_clock::now();
return res;
}
template <int Count, typename F>
double benchmark(F&& f, int start=int(Count) * 0.3){
// std::vector<TimeT::rep> times;
auto res = typename TimeT::rep();
int count = 0;
cudaDeviceSynchronize();
for (int i = 0; i < Count; ++i){
std::forward<F>(f)();
auto time = report();
if (i >= start){
// times.push_back(time)
res += time;
count += 1;
}
}
return res / double(count);
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
......
......@@ -23,7 +23,7 @@ class ConvAlgo(Enum):
Native = 0 # small memory cost, faster when number of points is large.
Batch = 1 # high memory cost, faster when number of points is small (< 50000)
BatchGemmGather = 2 # high memory cost, faster when number of points medium
SparseConvNet = 3
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
......
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
from spconv.modules import SparseModule
class RemoveDuplicate(SparseModule):
def forward(self, x: spconv.SparseConvTensor):
inds = x.indices
spatial_shape = [x.batch_size, *x.spatial_shape]
spatial_stride = [0] * len(spatial_shape)
val = 1
for i in range(inds.shape[1] - 1, -1, -1):
spatial_stride[i] = val
val *= spatial_shape[i]
indices_index = inds[:, -1]
for i in range(len(spatial_shape) - 1):
indices_index += spatial_stride[i] * inds[:, i]
_, unique_inds = torch.unique(indices_index)
new_inds = inds[unique_inds]
new_features = x.features[unique_inds]
res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid)
return res
\ No newline at end of file
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc fused_conv.cu)
endif()
add_library(spconv SHARED ${ALL_FILES})
......
#include <ATen/ATen.h>
#include <spconv/fused_conv.cu.h>
#include <spconv/fused_conv.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = output.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = output.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float, at::Half>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_forward2(stream, features.data_ptr<T>(), output.data_ptr<T>(),
filters.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>(), nHot, input_nPlanes,
input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = dout.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_backward_dW2(
stream, features.data_ptr<T>(), din.data_ptr<T>(), dout.data_ptr<T>(),
filters.data_ptr<T>(), dfilters.data_ptr<T>(),
indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>(), nHot,
input_nPlanes, input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
} // namespace spconv
\ No newline at end of file
......@@ -28,7 +28,6 @@
#include <thrust/execution_policy.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
......@@ -330,7 +329,6 @@ int create_submconv_indice_pair_cuda(
auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
......
#include <spconv/fused_conv.h>
#include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
#include <tensorview/tensor.h>
namespace spconv {
......@@ -140,27 +142,11 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
}
}
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBatch(features, filters, indicePairs, indiceNum,
numActOut, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
}
case kNative:
break;
default:
TV_THROW_RT_ERR("unknown algo");
}
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
......@@ -184,16 +170,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::mm_out(output, features, filters[indicePairMaxOffset]);
// get indice pair second max size based on subM symmetric property
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return output;
}
} else {
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
}
torch::Tensor inputBuffer =
......@@ -260,12 +246,59 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
return output;
}
torch::Tensor
indiceConvSparseConvNet(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("SparseConvNet only support gpu");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
fused_conv_cuda(output, features, filters[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return output;
}
template <bool BatchScatter>
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM,
bool batchScatter) {
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
auto batchScatter = BatchScatter;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
......@@ -388,29 +421,42 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
return output;
}
template <int Algo> struct ConvDispatch;
template <> struct ConvDispatch<kNative> {
constexpr static auto *func = indiceConvNative;
};
template <> struct ConvDispatch<kBatch> {
constexpr static auto *func = indiceConvBatch<false>;
};
template <> struct ConvDispatch<kBatchGemmGather> {
constexpr static auto *func = indiceConvBatch<true>;
};
template <> struct ConvDispatch<kSparseConvNet> {
constexpr static auto *func = indiceConvSparseConvNet;
};
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvDispatch<AlgoValue>::func(features, filters, indicePairs,
indiceNum, numActOut, _inverse, _subM);
});
return res;
}
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
}
case kNative:
break;
default:
TV_THROW_RT_ERR("unknown algo");
}
bool subM = _subM != 0;
bool inverse = _inverse != 0;
......@@ -437,16 +483,16 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
// get indice pair second max size based on subM symmetric property
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return {inputGrad, filtersGrad.view(filterShape)};
}
} else {
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
}
torch::Tensor inputBuffer =
......@@ -499,13 +545,66 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
}
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM, bool batchScatter) {
indiceConvBwSparseConvNet(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
fused_conv_backward_cuda(features, inputGrad, outGrad, filters[i],
filtersGrad[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <bool BatchScatter>
std::vector<torch::Tensor>
indiceConvBwBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto batchScatter = BatchScatter;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indiceNum.size(0);
......@@ -626,4 +725,35 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
return {inputGrad, filtersGrad.view(filterShape)};
}
template <int Algo> struct ConvBwDispatch;
template <> struct ConvBwDispatch<kNative> {
constexpr static auto *func = indiceConvBwNative;
};
template <> struct ConvBwDispatch<kBatch> {
constexpr static auto *func = indiceConvBwBatch<false>;
};
template <> struct ConvBwDispatch<kBatchGemmGather> {
constexpr static auto *func = indiceConvBwBatch<true>;
};
template <> struct ConvBwDispatch<kSparseConvNet> {
constexpr static auto *func = indiceConvBwSparseConvNet;
};
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo) {
std::vector<torch::Tensor> res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvBwDispatch<AlgoValue>::func(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
});
return res;
}
} // namespace spconv
......@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
stride,
padding,
dilation,
algo=spconv.ConvAlgo.Native):
algo=spconv.ConvAlgo.SparseConvNet):
super().__init__()
layers = [
spconv.SparseConv3d(in_channels,
......@@ -353,7 +353,7 @@ class TestSpConv(TestCase):
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
in_channels = [32]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [1, 2, 3]
......@@ -623,8 +623,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
shapes = [[400, 400, 15]]
batchsizes = [2]
in_channels = [32]
out_channels = [64]
in_channels = [19]
out_channels = [17]
ksizes = [(3, 3, 3)]
strides = [1]
paddings = [0]
......@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__':
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# TestSpConv().testSpConv3d()
TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment