Commit cfaa1a3a authored by yanyan's avatar yanyan
Browse files

add Minkowski conv kernel

parent 9ce18407
...@@ -186,7 +186,7 @@ ...@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier same "printed page" as the copyright notice for easier
identification within third-party archives. identification within third-party archives.
Copyright 2019 Yan Yan Copyright 2019-2020 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -13,10 +13,14 @@ ...@@ -13,10 +13,14 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <tensorview/tensorview.h> #include <tensorview/tensor.h>
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
using all_fused_conv_algos_t =
tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
void fused_conv_cuda(torch::Tensor output, torch::Tensor features, void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn, torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot); torch::Tensor indicesOut, int nHot);
...@@ -26,4 +30,26 @@ void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din, ...@@ -26,4 +30,26 @@ void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dfilters, torch::Tensor indicesIn, torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot); torch::Tensor indicesOut, int nHot);
void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor din, torch::Tensor dout,
torch::Tensor filters,
torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
template <int Algo> struct FusedConvDispatch;
template <> struct FusedConvDispatch<kFSparseConvNet> {
constexpr static auto *fwd = fused_conv_cuda;
constexpr static auto *bwd = fused_conv_backward_cuda;
};
template <> struct FusedConvDispatch<kFMinkowskiEngine> {
constexpr static auto *fwd = fused_conv_cuda_minkowski;
constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
};
} // namespace spconv } // namespace spconv
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB, Dtype *C,
const Itype *in_map, const Itype *out_map) {
// Use in_feat as A and kernel as B
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. x is for rows, y is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < wB)
atomicAdd(&C[wB * out_row + x], Csub);
// C[wB * out_row + x] += Csub;
}
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul2(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB,
const Dtype *D, const int wD, const int hD, Dtype *C,
Dtype *E, const Itype *in_map, const Itype *out_map) {
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
Dtype Esub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
// Transposed kernel
BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * BTs[k][tx];
}
// For Esub, reset to 0
Esub = 0;
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Esub += DTs[k][ty] * As[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// For the E matrix which requires accmulation of multiple blocks, use
// atomic addition. This can be replaced with a more sophisticaed reduction
// algorithm.
if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < hB)
atomicAdd(&C[hB * in_row + x], Csub);
}
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2019 Yan Yan // Copyright 2019-2020 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment