Unverified Commit f0d7a46d authored by Yan Yan's avatar Yan Yan Committed by GitHub
Browse files

Merge branch 'master' into master

parents 999c834c 83344f71
name: build
on:
push:
branches:
- master
schedule:
# * is a special character in YAML so you have to quote this string
- cron: '0 0 * * 6' # base builds run every saturday
jobs:
build:
runs-on: ubuntu-latest
env:
DOCKER_IMAGE_NAME: scrin/dev-spconv
DOCKER_FILE_PATH: ./Dockerfile
# TODO: create a action to reuse code. the problem is how to reuse docker-login.
steps:
- uses: actions/checkout@master
- name: Build Docker
run: |
docker build . --file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
- name: Login to Registry
uses: azure/docker-login@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Publish to Registry
run: |
docker push ${{env.DOCKER_IMAGE_NAME}}:latest
docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
......@@ -7,6 +7,7 @@ __pycache__/
*.so
*.o
*.out
*.so.*
# Distribution / packaging
.Python
......
FROM scrin/dev:latest
RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \
sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \
sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \
cd /root && \
git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \
cd ./spconv && \
SPCONV_FORCE_BUILD_CUDA=1 python setup.py install
# SpConv: PyTorch Spatially Sparse Convolution Library
[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.
The GPU Indice Generation algorithm is a unofficial implementation of paper [SECOND](http://www.mdpi.com/1424-8220/18/10/3337). That algorithm (don't include GPU SubM indice generation algorithm) may be protected by patent.
This project only support CUDA 9.0+ or CPU only. If you are using cuda 8.0, please update it to 9.0.
2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv.
This project only support tensors with spatial volume less than ```std::numeric_limits<int>::max()``` (~2e9). if someone really need very large space, open an issue.
## News:
## Docker:
2019-5-24: spconv v1.1 released, now indice generation will use hash table as default (CPU code only support hash table). you can use ```use_hash=False``` to use dense table when using CUDA. In addition, add CPU only build support.
```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow.
## Install on Ubuntu 16.04/18.04
* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support).
0. Use ```git clone xxx.git --recursive``` to clone this repo.
1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboostall-dev``` or download compressed files from boost official website and copy headers to include path.
1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path.
2. Download cmake >= 3.13.2, then add cmake executables to PATH.
......@@ -168,4 +170,4 @@ This project is licensed under the Apache license 2.0 License - see the [LICENSE
The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
\ No newline at end of file
The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
......@@ -13,49 +13,103 @@
// limitations under the License.
#pragma once
#include <tensorview/tensorview.h>
#include <tensorview/tensor.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tensorview/tensorview.h>
namespace py = pybind11;
namespace tv {
template <typename T> TensorView<T> arrayt2tv(py::array_t<T> arr) {
Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return TensorView<T>(arr.mutable_data(), shape);
}
template <typename T, typename TPyObject>
std::vector<T> array2Vector(TPyObject arr){
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<T> arr_cc = arr_np;
std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
return data;
template <typename T> TensorView<const T> carrayt2tv(py::array_t<T> arr) {
Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return TensorView<const T>(arr.data(), shape);
}
template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
return TensorView<T>(arr.data(), {arr.size()});
}
template <typename T>
std::vector<T> arrayT2Vector(py::array_t<T> arr)
{
std::vector<T> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename T, typename TPyObject>
tv::TensorView<T> array2TensorView(TPyObject arr){
py::array arr_np = arr;
py::array_t<T> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i){
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<T>(arr_cc.mutable_data(), shape);
TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
return TensorView<T>(arr.data(), shape);
}
template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
return TensorView<const T>(arr.data(), {arr.size()});
}
template <typename T>
tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i){
shape.push_back(arr.shape(i));
std::vector<T> shape2stride(const std::vector<T> &shape, T itemsize) {
T p = T(1);
std::vector<T> res;
for (auto iter = shape.rbegin(); iter != shape.rend(); ++iter) {
res.push_back(p * itemsize);
p *= *iter;
}
std::reverse(res.begin(), res.end());
return res;
}
tv::DType get_array_tv_dtype(const py::array& arr){
//
switch (arr.dtype().kind()){
case 'b': return tv::bool_;
case 'i': {
switch (arr.itemsize()){
case 1: return tv::int8;
case 2: return tv::int16;
case 4: return tv::int32;
case 8: return tv::int64;
default: break;
}
}
return tv::TensorView<T>(arr.mutable_data(), shape);
}
\ No newline at end of file
case 'u': {
switch (arr.itemsize()){
case 1: return tv::uint8;
case 2: return tv::uint16;
case 4: return tv::uint32;
case 8: return tv::uint64;
default: break;
}
}
case 'f': {
switch (arr.itemsize()){
case 4: return tv::float32;
case 8: return tv::float64;
default: break;
}
}
}
TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
}
Tensor array2tensor(py::array& arr) {
Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::from_blob(arr.mutable_data(), shape, get_array_tv_dtype(arr), -1);
}
} // namespace tv
......@@ -37,12 +37,12 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element(
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
indicePairNumCpu.data_ptr<int>(), indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(), indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
......@@ -68,15 +68,15 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
......
......@@ -37,7 +37,7 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
tv::check_torch_dtype<int>(shape);
tv::check_torch_dtype<T>(coors);
auto shapeData = shape.data<int>();
auto shapeData = shape.data_ptr<int>();
torch::Tensor canvas =
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options());
......
......@@ -71,7 +71,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -139,7 +139,7 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points,
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -225,7 +225,7 @@ int points_to_voxel_3d_with_filtering(
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
......@@ -274,4 +274,4 @@ int points_to_voxel_3d_with_filtering(
return voxel_num;
}
} // namespace spconv
\ No newline at end of file
} // namespace spconv
......@@ -33,7 +33,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
......@@ -75,7 +75,7 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
......
......@@ -110,7 +110,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
if (numActOut == -1){
if (numActOut == -1) {
// build hash failed. use CPU algorithm
auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU});
......@@ -123,7 +123,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
return {indices.to(device), indicePairs.to(device),
indiceNum.to(device)};
}
}
#endif
......@@ -169,7 +170,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
false, useHash);
if (numActOut == -1){
if (numActOut == -1) {
// build hash failed. use CPU algorithm
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
......@@ -184,7 +185,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
return {outInds.to(device).slice(0, 0, numActOut),
indicePairs.to(device), indiceNum.to(device)};
}
}
}
......@@ -328,372 +330,14 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
}
}
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
int64_t numActOut, int64_t _inverse, int64_t _subM);
template <typename T>
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <typename T>
torch::Tensor
indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data<int>()[0];
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() +
kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) {
indicePairMaxSize = subRuleMaxSize;
}
// auto timer = spconv::CudaContextTimer<>();
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
//
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// }
// for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
// {nHot, numOutPlanes}, options);
}
// totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
}
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// totalSAddTime += timer.report() / 1000.0;
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM);
} // namespace spconv
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensorview.h"
#include <memory>
#include <spconv/mp_helper.h>
#ifdef SPCONV_CUDA
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
namespace tv
{
enum DType
{
float32,
int32,
int16,
int8,
float64,
bool_,
uint8,
float16,
int64,
uint16,
uint32,
uint64
};
namespace detail
{
template <typename T>
class TensorStorage
{
public:
TensorStorage(size_t size, int device = -1, bool managed = false)
: mSize(size), device_(device), managed_(managed)
{
if (size == 0)
{
mPtr = nullptr;
}
else
{
if (device == -1)
{
#ifdef SPCONV_CUDA
checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
#else
mPtr = new T[size];
#endif
}
else
{
#ifdef SPCONV_CUDA
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount)
{
TV_ASSERT_INVALID_ARG("you provide device ", device,
" but you only have ", deviceCount, " device.");
}
cudaSetDevice(device);
if (managed)
{
checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
}
else
{
checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
}
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
#endif
}
}
}
TensorStorage(T *ptr, size_t size, int device)
: mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
virtual ~TensorStorage()
{
if (empty())
{
return;
}
if (from_blob_)
{
return;
}
if (device_ == -1)
{
#ifdef SPCONV_CUDA
cudaFreeHost(mPtr);
#else
delete[] mPtr;
#endif
}
else
{
#ifdef SPCONV_CUDA
cudaFree(mPtr);
#endif
}
};
inline size_t size() const { return mSize; }
T *data() { return mPtr; }
const T *data() const { return mPtr; }
bool empty() const { return mPtr == nullptr || mSize == 0; }
bool managed() const { return managed_; }
int device() const { return device_; }
void zero_()
{
if (device_ == -1)
{
std::memset(data(), 0, mSize);
// std::fill(data(), data() + mSize, 0);
}
else
{
#ifdef SPCONV_CUDA
checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
#endif
}
}
private:
T *mPtr = nullptr;
size_t mSize = 0;
int device_ = -1;
bool from_blob_ = false;
bool managed_ = false;
};
size_t sizeof_dtype(DType dtype)
{
switch (dtype)
{
case float32:
return sizeof(float);
case int8:
return sizeof(int8_t);
case int16:
return sizeof(int16_t);
case int32:
return sizeof(int32_t);
case float64:
return sizeof(double);
case int64:
return sizeof(int64_t);
case bool_:
return sizeof(bool);
case uint8:
return sizeof(uint8_t);
case uint16:
return sizeof(uint16_t);
case uint32:
return sizeof(uint32_t);
case uint64:
return sizeof(uint64_t);
#ifdef SPCONV_CUDA
case float16:
return sizeof(__half);
#endif
default:
TV_THROW_RT_ERR("unsupported dtype");
}
return 0;
}
std::string typeString(DType t)
{
switch (t)
{
case DType::bool_:
return "bool";
case DType::float32:
return "float32";
case DType::int8:
return "int8";
case DType::int16:
return "int16";
case DType::int32:
return "int32";
case DType::float64:
return "float64";
case DType::int64:
return "int64";
case DType::uint8:
return "uint8";
case DType::uint16:
return "uint16";
case DType::uint32:
return "uint32";
case DType::uint64:
return "uint64";
#ifdef SPCONV_CUDA
case DType::float16:
return "half";
#endif
default:
return "";
}
}
template <typename T>
struct TypeToDtypeTraits;
template <>
struct TypeToDtypeTraits<int32_t>
{
static constexpr DType dtype = int32;
};
#ifdef SPCONV_CUDA
template <>
struct TypeToDtypeTraits<__half>
{
static constexpr DType dtype = float16;
};
#endif
template <>
struct TypeToDtypeTraits<float>
{
static constexpr DType dtype = float32;
};
template <>
struct TypeToDtypeTraits<double>
{
static constexpr DType dtype = float64;
};
template <>
struct TypeToDtypeTraits<int16_t>
{
static constexpr DType dtype = int16;
};
template <>
struct TypeToDtypeTraits<int8_t>
{
static constexpr DType dtype = int8;
};
template <>
struct TypeToDtypeTraits<int64_t>
{
static constexpr DType dtype = int64;
};
template <>
struct TypeToDtypeTraits<uint8_t>
{
static constexpr DType dtype = uint8;
};
template <>
struct TypeToDtypeTraits<uint16_t>
{
static constexpr DType dtype = uint16;
};
template <>
struct TypeToDtypeTraits<uint32_t>
{
static constexpr DType dtype = uint32;
};
template <>
struct TypeToDtypeTraits<uint64_t>
{
static constexpr DType dtype = uint64;
};
} // namespace detail
template <class T>
constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
struct Tensor
{
Tensor() {}
Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
: dtype_(dtype)
{
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed);
shape_ = shape;
}
Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
{
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
}
template <typename T>
TensorView<T> tview()
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
}
template <typename T>
TensorView<T> tview() const
{
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return TensorView<const std::remove_const_t<T>>(
reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
shape_);
}
bool empty() const { return storage_->empty(); }
DType dtype() const { return dtype_; }
int device() const { return storage_->device(); }
const Shape &shape() const { return shape_; }
int dim(int idx) const
{
TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
return shape_[idx];
}
const uint8_t *raw_data() const { return storage_->data(); }
size_t size() const { return shape_.size(); }
Tensor &zero_()
{
storage_->zero_();
return *this;
}
uint8_t *raw_data() { return storage_->data(); }
template <typename T>
Tensor &fill_(T value)
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
auto ptr = reinterpret_cast<T *>(raw_data());
std::fill(ptr, ptr + size(), value);
return *this;
}
template <typename T>
T *data()
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<T *>(raw_data());
}
template <typename T>
const T *data() const
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<const T *>(raw_data());
}
void copy_(const Tensor &tensor)
{
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
if (device() == -1 && tensor.device() == -1)
{
#ifdef SPCONV_CUDA
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
#else
std::copy(tensor.raw_data(),
tensor.raw_data() + size() * detail::sizeof_dtype(dtype_),
storage_->data());
#endif
}
#ifdef SPCONV_CUDA
else if (device() >= 0 && tensor.device() == -1)
{
// host2dev
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() == -1 && tensor.device() >= 0)
{
// dev2host
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() >= 0 && tensor.device() >= 0)
{
// dev2dev
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
#endif
else
{
TV_ASSERT_RT_ERR(false, "only support cpu tensor");
}
}
Tensor cpu() const
{
if (storage_->device() == -1)
{
return *this;
}
Tensor res(shape_, dtype_, -1, storage_->managed());
res.copy_(*this);
return res;
}
template <typename T>
void copy_(const TensorView<T> &tensor, int device)
{
Tensor src = from_blob(tensor, device);
return copy_(src);
}
protected:
DType dtype_;
std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
Shape shape_;
};
inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
{
return Tensor(ptr, shape, dtype, device);
}
template <typename T>
Tensor from_blob(TensorView<T> tensor, int device)
{
return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
}
template <class... Ts, typename F>
void dispatch(DType t, F &&f)
{
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (type_v<decltype(I)> == t)
{
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound)
{
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", detail::typeString(t),
", available: ", ss.str());
}
}
} // namespace tv
\ No newline at end of file
......@@ -17,9 +17,9 @@
#include <cassert>
#include <cstdlib>
#include "prettyprint.h"
#include <iostream>
#include <memory>
// #include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>
......@@ -27,7 +27,6 @@
#include <cuda_runtime_api.h>
#endif
namespace tv {
#ifdef __NVCC__
......@@ -72,15 +71,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
sstream_print(ss, args...);
}
template <class... TArgs> void ssprint(TArgs... args) {
std::stringstream ss;
sstream_print(ss, args...);
std::cout << ss.str() << std::endl;
}
#define TV_THROW_RT_ERR(...) \
{ \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
if (!(expr)) \
TV_THROW_RT_ERR(__VA_ARGS__); \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
......@@ -96,24 +104,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
#define TV_CHECK_CUDA_ERR() \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << "cuda execution failed with error " << __macro_err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR_V2(...) \
#define TV_CHECK_CUDA_ERR_V2(...) \
{ \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n";\
tv::sstream_print(__macro_s, __VA_ARGS__); \
__macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
......@@ -362,7 +370,8 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
: SimpleVector<int, MaxDim>(vec) {}
template <typename T, template <class...> class Container>
ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
......@@ -372,7 +381,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
......@@ -417,6 +426,13 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
}
return shape;
}
TV_HOST_DEVICE size_t prod() const {
size_t res = 1;
for (size_t i = 0; i < this->mSize; ++i) {
res *= this->mArray[i];
}
return res;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
......@@ -545,6 +561,9 @@ template <typename T, int Rank = -1> struct TensorView {
: mPtr(ptr) {
mShape = {int(shapes)...};
}
operator TensorView<const T>() {
return TensorView<const T>(mPtr, mShape);
} // conversion function
TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) {
......@@ -846,6 +865,19 @@ template <typename T, int Rank = -1> struct TensorView {
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
// TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
......@@ -970,6 +1002,26 @@ template <typename T, int Rank = -1> struct TensorView {
mShape.subshape(sizeof...(ints) + 1));
}
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
subview_ints(SimpleVector<int> ids) const {
Shape start = ids;
for (int i = ids.size(); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(ids.size()));
}
std::string print_vec(TensorView<T> tensor) const {
std::ostringstream ss;
ss << "[";
for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
ss << tensor(i) << ", ";
}
ss << tensor(tensor.dim(0) - 1) << "]";
return ss.str();
}
std::string repr() const {
std::ostringstream ss;
if (empty())
......@@ -983,37 +1035,29 @@ template <typename T, int Rank = -1> struct TensorView {
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
for (int i = 0; i < counter.ndim() - 1; ++i) {
counter[i] = 0;
ss << "[";
// ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
for (int i = 0; i < counter.ndim() - 1; ++i) {
if (counter[i] == 0) {
ss << "[";
}
}
if (print_comma && i != this->size() - 1)
ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0)
ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
std::cout << "counter.ndim() " << counter.ndim() << std::endl;
auto counter_ = counter.subshape(0, counter.ndim() - 1);
std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
ss << print_vec(this->subview_ints(counter_)) << "\n";
std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
for (int i = 0; i < counter.ndim() - 1; ++i) {
if (counter[i] == this->dim(i) - 1) {
ss << "]";
}
}
}
ss << "]";
// ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
......@@ -1161,4 +1205,160 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
return printTensorView(TensorView<const T>(ptr, shape), format);
}
#ifdef SPCONV_CUDA
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
static_cast<unsigned int>(result), func);
DEVICE_RESET
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
checkCudaErrors(
cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
template <typename T>
void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
}
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
}
template <typename T>
void host2host(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T> void zero_dev(TensorView<T> tensor) {
checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
}
template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
checkCudaErrors(
cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
}
template <typename T> void zero_host(TensorView<T> tensor) {
std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
}
#endif
namespace detail {
template <typename T> struct TypeToString;
template <> struct TypeToString<int32_t> {
static constexpr const char *value = "int32";
};
template <> struct TypeToString<bool> {
static constexpr const char *value = "bool";
};
template <> struct TypeToString<float> {
static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
static constexpr const char *value = "double";
};
template <> struct TypeToString<int16_t> {
static constexpr const char *value = "int16";
};
template <> struct TypeToString<int8_t> {
static constexpr const char *value = "int8";
};
template <> struct TypeToString<int64_t> {
static constexpr const char *value = "int64";
};
template <> struct TypeToString<uint8_t> {
static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
static constexpr const char *value = "uint64";
};
} // namespace detail
} // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,24 +13,26 @@
// limitations under the License.
#pragma once
#include <spconv/mp_helper.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h>
#include <torch/script.h>
#ifdef SPCONV_CUDA
#include <ATen/cuda/CUDAContext.h>
#endif
namespace tv {
#ifdef SPCONV_CUDA
struct TorchGPU: public tv::GPU {
struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream();
}
};
#endif
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) {
switch (tensor.scalar_type()) {
case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value;
TV_ASSERT_RT_ERR(val, "error");
......@@ -60,39 +62,63 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
TV_ASSERT_RT_ERR(false, "error");
}
}
namespace detail {
template <typename T> struct TypeToTorchDtypeTraits;
template <typename T>
constexpr auto type2torch(T val=T()) -> decltype(torch::kInt32){
TV_ASSERT_RT_ERR(false, "unknown type");
}
template <>
constexpr auto type2torch(int val) -> decltype(torch::kInt32){
return torch::kInt32;
}
template <> struct TypeToTorchDtypeTraits<int32_t> {
static constexpr decltype(torch::kInt32) value = torch::kInt32;
};
template <>
constexpr auto type2torch(long val) -> decltype(torch::kInt32){
return torch::kInt64;
}
template <> struct TypeToTorchDtypeTraits<int64_t> {
static constexpr decltype(torch::kInt32) value = torch::kInt64;
};
template <>
constexpr auto type2torch(float val) -> decltype(torch::kInt32){
return torch::kFloat32;
}
template <> struct TypeToTorchDtypeTraits<float> {
static constexpr decltype(torch::kInt32) value = torch::kFloat32;
};
template <> struct TypeToTorchDtypeTraits<double> {
static constexpr decltype(torch::kInt32) value = torch::kFloat64;
};
template <> struct TypeToTorchDtypeTraits<at::Half> {
static constexpr decltype(torch::kInt32) value = torch::kHalf;
};
template <>
constexpr auto type2torch(double val) -> decltype(torch::kInt32){
return torch::kFloat64;
}
} // namespace detail
template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
constexpr decltype(torch::kInt32) torch_type_v =
detail::TypeToTorchDtypeTraits<T>::value;
template <typename T> tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<T>(tensor);
tv::Shape shape;
for (auto i : tensor.sizes()) {
shape.push_back(i);
}
return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
namespace detail {
template <> struct TypeToString<at::Half> {
static constexpr const char *value = "half";
};
} // namespace detail
template <class... Ts, typename F>
void torch_dispatch(at::ScalarType t, F &&f) {
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (torch_type_v<decltype(I)> == t) {
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound) {
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << tv::detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
}
}
} // namespace tv
\ No newline at end of file
......@@ -15,6 +15,8 @@ from pathlib import Path
LIBTORCH_ROOT = str(Path(torch.__file__).parent)
SPCONV_FORCE_BUILD_CUDA = os.getenv("SPCONV_FORCE_BUILD_CUDA")
PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
class CMakeExtension(Extension):
......@@ -46,7 +48,7 @@ class CMakeBuild(build_ext):
'-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
'-DSPCONV_BuildTests=OFF',
] # -arch=sm_61
if not torch.cuda.is_available():
if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
cmake_args += ['-DSPCONV_BuildCUDA=OFF']
else:
cuda_flags = ["\"--expt-relaxed-constexpr\""]
......
# Copyright 2019 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -23,6 +23,8 @@ from spconv.conv import SparseConvTranspose2d, SparseConvTranspose3d
from spconv.conv import SparseInverseConv2d, SparseInverseConv3d
from spconv.modules import SparseModule, SparseSequential
from spconv.pool import SparseMaxPool2d, SparseMaxPool3d
from spconv.tables import ConcatTable, JoinTable, AddTable
from spconv.identity import Identity
from spconv import ops
......@@ -55,7 +57,7 @@ class SparseConvTensor(object):
is very large.
"""
self.features = features
self.indices = indices
self.indices = indices
if self.indices.dtype != torch.int32:
self.indices.int()
self.spatial_shape = spatial_shape
......@@ -69,7 +71,7 @@ class SparseConvTensor(object):
def find_indice_pair(self, key):
if key is None:
return None
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
......@@ -100,4 +102,4 @@ class RemoveGrid(SparseModule):
"""
def forward(self, x: SparseConvTensor):
x.grid = None
return x
\ No newline at end of file
return x
......@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
inverse=False,
indice_key=None,
fused_bn=False,
use_hash=True):
use_hash=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
......@@ -221,7 +221,7 @@ class SparseConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SparseConv2d, self).__init__(
2,
in_channels,
......@@ -247,7 +247,7 @@ class SparseConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SparseConv3d, self).__init__(
3,
in_channels,
......@@ -273,7 +273,7 @@ class SparseConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SparseConv4d, self).__init__(
4,
in_channels,
......@@ -299,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
......@@ -326,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
......@@ -387,7 +387,7 @@ class SubMConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SubMConv2d, self).__init__(
2,
in_channels,
......@@ -414,7 +414,7 @@ class SubMConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SubMConv3d, self).__init__(
3,
in_channels,
......@@ -441,7 +441,7 @@ class SubMConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=True):
use_hash=False):
super(SubMConv4d, self).__init__(
4,
in_channels,
......
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from torch.nn import Module
class Identity(Module):
def forward(self, input):
return input
def input_spatial_size(self, out_size):
return out_size
......@@ -72,7 +72,7 @@ class SparseSequential(SparseModule):
('conv2', SparseConv2d(20,64,5)),
('relu2', nn.ReLU())
]))
# Example of using Sequential with kwargs(python 3.6+)
model = SparseSequential(
conv1=SparseConv2d(1,20,5),
......@@ -125,9 +125,12 @@ class SparseSequential(SparseModule):
def forward(self, input):
for k, module in self._modules.items():
if is_spconv_module(module): # use SpConvTensor as input
assert isinstance(input, spconv.SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
if isinstance(input, list):
input = module(input)
else:
assert isinstance(input, spconv.SparseConvTensor)
self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, spconv.SparseConvTensor):
if input.indices.shape[0] != 0:
......
......@@ -111,16 +111,9 @@ def indice_conv(features,
num_activate_out,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs,
indice_pair_num, num_activate_out,
int(inverse), int(subm))
elif filters.dtype == torch.half:
return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
indice_pair_num, num_activate_out,
int(inverse), int(subm))
else:
raise NotImplementedError
return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out,
int(inverse), int(subm))
def fused_indice_conv(features, filters, bias,
indice_pairs,
......@@ -145,14 +138,8 @@ def indice_conv_backward(features,
indice_pair_num,
inverse=False,
subm=False):
if filters.dtype == torch.float32:
return torch.ops.spconv.indice_conv_backward_fp32(
features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
elif filters.dtype == torch.half:
return torch.ops.spconv.indice_conv_backward_half(
features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
else:
raise NotImplementedError
return torch.ops.spconv.indice_conv_backward(
features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
......
from torch.autograd import Function
#from torch.nn import Module
from spconv.modules import SparseModule
import spconv
import torch
class JoinTable(SparseModule):# Module):
def forward(self, input):
output = spconv.SparseConvTensor(
torch.cat([i.features for i in input],1), input[1].indices,
input[1].spatial_shape, input[0].batch_size )
output.indice_dict = input[1].indice_dict
output.grid = input[1].grid
return output
def input_spatial_size(self, out_size):
return out_size
class AddTable(SparseModule): # Module):
def forward(self, input):
output = spconv.SparseConvTensor(
sum([i.features for i in input]), input[1].indices,
input[1].spatial_shape, input[1].batch_size )
output.indice_dict = input[1].indice_dict
output.grid = input[1].grid
return output
def input_spatial_size(self, out_size):
return out_size
class ConcatTable(SparseModule): # Module):
def forward(self, input):
return [module(input) for module in self._modules.values()]
def add(self, module):
self._modules[str(len(self._modules))] = module
return self
def input_spatial_size(self, out_size):
return self._modules['0'].input_spatial_size(out_size)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment