Commit 5073a413 authored by tusimple's avatar tusimple
Browse files

some c++ change:

1. use dispatch instead of template
2. set use_hash default to False
parent ec40b6f7
...@@ -7,6 +7,7 @@ __pycache__/ ...@@ -7,6 +7,7 @@ __pycache__/
*.so *.so
*.o *.o
*.out *.out
*.so.*
# Distribution / packaging # Distribution / packaging
.Python .Python
......
...@@ -13,49 +13,103 @@ ...@@ -13,49 +13,103 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <tensorview/tensorview.h>
#include <tensorview/tensor.h>
#include <algorithm> #include <algorithm>
#include <array>
#include <iostream> #include <iostream>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h> #include <pybind11/functional.h>
#include <pybind11/numpy.h> #include <pybind11/numpy.h>
#include <pybind11/pybind11.h> #include <pybind11/pybind11.h>
#include <pybind11/stl.h> #include <pybind11/stl.h>
#include <tensorview/tensorview.h>
namespace py = pybind11; namespace py = pybind11;
namespace tv {
template <typename T> TensorView<T> arrayt2tv(py::array_t<T> arr) {
Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return TensorView<T>(arr.mutable_data(), shape);
}
template <typename T, typename TPyObject> template <typename T> TensorView<const T> carrayt2tv(py::array_t<T> arr) {
std::vector<T> array2Vector(TPyObject arr){ Shape shape;
py::array arr_np = arr; for (int i = 0; i < arr.ndim(); ++i) {
size_t size = arr.attr("size").template cast<size_t>(); shape.push_back(arr.shape(i));
py::array_t<T> arr_cc = arr_np; }
std::vector<T> data(arr_cc.data(), arr_cc.data() + size); return TensorView<const T>(arr.data(), shape);
return data; }
template <typename T> TensorView<T> vector2tv(std::vector<T> &arr) {
return TensorView<T>(arr.data(), {arr.size()});
} }
template <typename T> template <typename T>
std::vector<T> arrayT2Vector(py::array_t<T> arr) TensorView<T> vector2tv(std::vector<T> &arr, Shape shape) {
{ TV_ASSERT_INVALID_ARG(shape.prod() == arr.size(), "error");
std::vector<T> data(arr.data(), arr.data() + arr.size()); return TensorView<T>(arr.data(), shape);
return data; }
}
template <typename T> TensorView<const T> vector2tv(const std::vector<T> &arr) {
template <typename T, typename TPyObject> return TensorView<const T>(arr.data(), {arr.size()});
tv::TensorView<T> array2TensorView(TPyObject arr){
py::array arr_np = arr;
py::array_t<T> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i){
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<T>(arr_cc.mutable_data(), shape);
} }
template <typename T> template <typename T>
tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){ std::vector<T> shape2stride(const std::vector<T> &shape, T itemsize) {
tv::Shape shape; T p = T(1);
for (int i = 0; i < arr.ndim(); ++i){ std::vector<T> res;
shape.push_back(arr.shape(i)); for (auto iter = shape.rbegin(); iter != shape.rend(); ++iter) {
res.push_back(p * itemsize);
p *= *iter;
}
std::reverse(res.begin(), res.end());
return res;
}
tv::DType get_array_tv_dtype(const py::array& arr){
//
switch (arr.dtype().kind()){
case 'b': return tv::bool_;
case 'i': {
switch (arr.itemsize()){
case 1: return tv::int8;
case 2: return tv::int16;
case 4: return tv::int32;
case 8: return tv::int64;
default: break;
}
} }
return tv::TensorView<T>(arr.mutable_data(), shape); case 'u': {
} switch (arr.itemsize()){
\ No newline at end of file case 1: return tv::uint8;
case 2: return tv::uint16;
case 4: return tv::uint32;
case 8: return tv::uint64;
default: break;
}
}
case 'f': {
switch (arr.itemsize()){
case 4: return tv::float32;
case 8: return tv::float64;
default: break;
}
}
}
TV_THROW_RT_ERR("unknown dtype", arr.dtype().kind(), arr.itemsize());
}
Tensor array2tensor(py::array& arr) {
Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::from_blob(arr.mutable_data(), shape, get_array_tv_dtype(arr), -1);
}
} // namespace tv
...@@ -110,7 +110,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -110,7 +110,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32, tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash); dilation32, outSpatialShape32, transpose, false, useHash);
if (numActOut == -1){ if (numActOut == -1) {
// build hash failed. use CPU algorithm // build hash failed. use CPU algorithm
auto device = indices.device(); auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU}); indicePairs = indicePairs.to({torch::kCPU});
...@@ -123,7 +123,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -123,7 +123,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32, tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash); dilation32, outSpatialShape32, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device), indiceNum.to(device)}; return {indices.to(device), indicePairs.to(device),
indiceNum.to(device)};
} }
} }
#endif #endif
...@@ -169,7 +170,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -169,7 +170,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
false, useHash); false, useHash);
if (numActOut == -1){ if (numActOut == -1) {
// build hash failed. use CPU algorithm // build hash failed. use CPU algorithm
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>(); functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
...@@ -184,7 +185,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -184,7 +185,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32, kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose); transpose);
return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)}; return {outInds.to(device).slice(0, 0, numActOut),
indicePairs.to(device), indiceNum.to(device)};
} }
} }
} }
...@@ -328,372 +330,14 @@ std::vector<torch::Tensor> getIndicePairPreGrid( ...@@ -328,372 +330,14 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
} }
} }
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) { int64_t numActOut, int64_t _inverse, int64_t _subM);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob =
torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
template <typename T>
std::vector<torch::Tensor> std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters, indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) { torch::Tensor indiceNum, int64_t _inverse, int64_t _subM);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob =
torch::from_blob(outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <typename T>
torch::Tensor
indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) {
indicePairMaxSize = subRuleMaxSize;
}
// auto timer = spconv::CudaContextTimer<>();
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
//
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// }
// for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
// {nHot, numOutPlanes}, options);
}
// totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
}
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// totalSAddTime += timer.report() / 1000.0;
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv } // namespace spconv
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensorview.h"
#include <memory>
#include <spconv/mp_helper.h>
#ifdef SPCONV_CUDA
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
namespace tv
{
enum DType
{
float32,
int32,
int16,
int8,
float64,
bool_,
uint8,
float16,
int64,
uint16,
uint32,
uint64
};
namespace detail
{
template <typename T>
class TensorStorage
{
public:
TensorStorage(size_t size, int device = -1, bool managed = false)
: mSize(size), device_(device), managed_(managed)
{
if (size == 0)
{
mPtr = nullptr;
}
else
{
if (device == -1)
{
#ifdef SPCONV_CUDA
checkCudaErrors(cudaMallocHost(&mPtr, size * sizeof(T)));
#else
mPtr = new T[size];
#endif
}
else
{
#ifdef SPCONV_CUDA
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount)
{
TV_ASSERT_INVALID_ARG("you provide device ", device,
" but you only have ", deviceCount, " device.");
}
cudaSetDevice(device);
if (managed)
{
checkCudaErrors(cudaMallocManaged(&this->mPtr, size * sizeof(T)));
}
else
{
checkCudaErrors(cudaMalloc(&mPtr, size * sizeof(T)));
}
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
#endif
}
}
}
TensorStorage(T *ptr, size_t size, int device)
: mSize(size), mPtr(ptr), from_blob_(true), device_(device) {}
virtual ~TensorStorage()
{
if (empty())
{
return;
}
if (from_blob_)
{
return;
}
if (device_ == -1)
{
#ifdef SPCONV_CUDA
cudaFreeHost(mPtr);
#else
delete[] mPtr;
#endif
}
else
{
#ifdef SPCONV_CUDA
cudaFree(mPtr);
#endif
}
};
inline size_t size() const { return mSize; }
T *data() { return mPtr; }
const T *data() const { return mPtr; }
bool empty() const { return mPtr == nullptr || mSize == 0; }
bool managed() const { return managed_; }
int device() const { return device_; }
void zero_()
{
if (device_ == -1)
{
std::memset(data(), 0, mSize);
// std::fill(data(), data() + mSize, 0);
}
else
{
#ifdef SPCONV_CUDA
checkCudaErrors(cudaMemset(data(), 0, mSize / sizeof(T)));
#else
TV_ASSERT_INVALID_ARG(false, "don't compiled with cuda");
#endif
}
}
private:
T *mPtr = nullptr;
size_t mSize = 0;
int device_ = -1;
bool from_blob_ = false;
bool managed_ = false;
};
size_t sizeof_dtype(DType dtype)
{
switch (dtype)
{
case float32:
return sizeof(float);
case int8:
return sizeof(int8_t);
case int16:
return sizeof(int16_t);
case int32:
return sizeof(int32_t);
case float64:
return sizeof(double);
case int64:
return sizeof(int64_t);
case bool_:
return sizeof(bool);
case uint8:
return sizeof(uint8_t);
case uint16:
return sizeof(uint16_t);
case uint32:
return sizeof(uint32_t);
case uint64:
return sizeof(uint64_t);
#ifdef SPCONV_CUDA
case float16:
return sizeof(__half);
#endif
default:
TV_THROW_RT_ERR("unsupported dtype");
}
return 0;
}
std::string typeString(DType t)
{
switch (t)
{
case DType::bool_:
return "bool";
case DType::float32:
return "float32";
case DType::int8:
return "int8";
case DType::int16:
return "int16";
case DType::int32:
return "int32";
case DType::float64:
return "float64";
case DType::int64:
return "int64";
case DType::uint8:
return "uint8";
case DType::uint16:
return "uint16";
case DType::uint32:
return "uint32";
case DType::uint64:
return "uint64";
#ifdef SPCONV_CUDA
case DType::float16:
return "half";
#endif
default:
return "";
}
}
template <typename T>
struct TypeToDtypeTraits;
template <>
struct TypeToDtypeTraits<int32_t>
{
static constexpr DType dtype = int32;
};
#ifdef SPCONV_CUDA
template <>
struct TypeToDtypeTraits<__half>
{
static constexpr DType dtype = float16;
};
#endif
template <>
struct TypeToDtypeTraits<float>
{
static constexpr DType dtype = float32;
};
template <>
struct TypeToDtypeTraits<double>
{
static constexpr DType dtype = float64;
};
template <>
struct TypeToDtypeTraits<int16_t>
{
static constexpr DType dtype = int16;
};
template <>
struct TypeToDtypeTraits<int8_t>
{
static constexpr DType dtype = int8;
};
template <>
struct TypeToDtypeTraits<int64_t>
{
static constexpr DType dtype = int64;
};
template <>
struct TypeToDtypeTraits<uint8_t>
{
static constexpr DType dtype = uint8;
};
template <>
struct TypeToDtypeTraits<uint16_t>
{
static constexpr DType dtype = uint16;
};
template <>
struct TypeToDtypeTraits<uint32_t>
{
static constexpr DType dtype = uint32;
};
template <>
struct TypeToDtypeTraits<uint64_t>
{
static constexpr DType dtype = uint64;
};
} // namespace detail
template <class T>
constexpr DType type_v = detail::TypeToDtypeTraits<T>::dtype;
struct Tensor
{
Tensor() {}
Tensor(Shape shape, DType dtype, int device = -1, bool managed = false)
: dtype_(dtype)
{
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
shape.size() * detail::sizeof_dtype(dtype), device, managed);
shape_ = shape;
}
Tensor(void *ptr, Shape shape, DType dtype, int device = -1) : dtype_(dtype)
{
storage_ = std::make_shared<detail::TensorStorage<uint8_t>>(
reinterpret_cast<uint8_t *>(ptr),
shape.size() * detail::sizeof_dtype(dtype), device);
shape_ = shape;
}
template <typename T>
TensorView<T> tview()
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
return TensorView<T>(reinterpret_cast<T *>(storage_->data()), shape_);
}
template <typename T>
TensorView<T> tview() const
{
TV_ASSERT_RT_ERR(shape_.size() == storage_->size() / sizeof(T), "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return TensorView<const std::remove_const_t<T>>(
reinterpret_cast<const std::remove_const_t<T> *>(storage_->data()),
shape_);
}
bool empty() const { return storage_->empty(); }
DType dtype() const { return dtype_; }
int device() const { return storage_->device(); }
const Shape &shape() const { return shape_; }
int dim(int idx) const
{
TV_ASSERT_RT_ERR(idx < shape_.size(), "error");
return shape_[idx];
}
const uint8_t *raw_data() const { return storage_->data(); }
size_t size() const { return shape_.size(); }
Tensor &zero_()
{
storage_->zero_();
return *this;
}
uint8_t *raw_data() { return storage_->data(); }
template <typename T>
Tensor &fill_(T value)
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
auto ptr = reinterpret_cast<T *>(raw_data());
std::fill(ptr, ptr + size(), value);
return *this;
}
template <typename T>
T *data()
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<T *>(raw_data());
}
template <typename T>
const T *data() const
{
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return reinterpret_cast<const T *>(raw_data());
}
void copy_(const Tensor &tensor)
{
TV_ASSERT_RT_ERR(!empty() && !tensor.empty(), "must not empty");
TV_ASSERT_RT_ERR(size() == tensor.size(), "must have same size");
TV_ASSERT_RT_ERR(dtype() == tensor.dtype(), "must have same dtype");
if (device() == -1 && tensor.device() == -1)
{
#ifdef SPCONV_CUDA
host2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
#else
std::copy(tensor.raw_data(),
tensor.raw_data() + size() * detail::sizeof_dtype(dtype_),
storage_->data());
#endif
}
#ifdef SPCONV_CUDA
else if (device() >= 0 && tensor.device() == -1)
{
// host2dev
host2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() == -1 && tensor.device() >= 0)
{
// dev2host
dev2host(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
else if (device() >= 0 && tensor.device() >= 0)
{
// dev2dev
dev2dev(storage_->data(), tensor.raw_data(),
size() * detail::sizeof_dtype(dtype_));
}
#endif
else
{
TV_ASSERT_RT_ERR(false, "only support cpu tensor");
}
}
Tensor cpu() const
{
if (storage_->device() == -1)
{
return *this;
}
Tensor res(shape_, dtype_, -1, storage_->managed());
res.copy_(*this);
return res;
}
template <typename T>
void copy_(const TensorView<T> &tensor, int device)
{
Tensor src = from_blob(tensor, device);
return copy_(src);
}
protected:
DType dtype_;
std::shared_ptr<detail::TensorStorage<uint8_t>> storage_;
Shape shape_;
};
inline Tensor from_blob(void *ptr, Shape shape, DType dtype, int device)
{
return Tensor(ptr, shape, dtype, device);
}
template <typename T>
Tensor from_blob(TensorView<T> tensor, int device)
{
return Tensor(tensor.data(), tensor.shape, type_v<T>, device);
}
template <class... Ts, typename F>
void dispatch(DType t, F &&f)
{
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (type_v<decltype(I)> == t)
{
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound)
{
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", detail::typeString(t),
", available: ", ss.str());
}
}
} // namespace tv
\ No newline at end of file
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <cassert> #include <cassert>
#include <cstdlib> #include <cstdlib>
#include "prettyprint.h"
#include <iostream> #include <iostream>
#include <memory> #include <memory>
// #include <prettyprint.h>
#include <sstream> #include <sstream>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif #endif
namespace tv { namespace tv {
#ifdef __NVCC__ #ifdef __NVCC__
...@@ -72,15 +71,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) { ...@@ -72,15 +71,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
sstream_print(ss, args...); sstream_print(ss, args...);
} }
template <class... TArgs> void ssprint(TArgs... args) {
std::stringstream ss;
sstream_print(ss, args...);
std::cout << ss.str() << std::endl;
}
#define TV_THROW_RT_ERR(...) \
{ \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
}
#define TV_ASSERT_RT_ERR(expr, ...) \ #define TV_ASSERT_RT_ERR(expr, ...) \
{ \ { \
if (!(expr)) { \ if (!(expr)) \
std::stringstream __macro_s; \ TV_THROW_RT_ERR(__VA_ARGS__); \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
} }
#define TV_ASSERT_INVALID_ARG(expr, ...) \ #define TV_ASSERT_INVALID_ARG(expr, ...) \
...@@ -96,24 +104,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) { ...@@ -96,24 +104,24 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
#define TV_CHECK_CUDA_ERR() \ #define TV_CHECK_CUDA_ERR() \
{ \ { \
auto __macro_err = cudaGetLastError(); \ auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \ if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \ std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \ __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \ __macro_s << "cuda execution failed with error " << __macro_err; \
throw std::runtime_error(__macro_s.str()); \ throw std::runtime_error(__macro_s.str()); \
} \ } \
} }
#define TV_CHECK_CUDA_ERR_V2(...) \ #define TV_CHECK_CUDA_ERR_V2(...) \
{ \ { \
auto __macro_err = cudaGetLastError(); \ auto __macro_err = cudaGetLastError(); \
if (__macro_err != cudaSuccess) { \ if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \ std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \ __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << __macro_err; \ __macro_s << "cuda execution failed with error " << __macro_err; \
__macro_s << " " << cudaGetErrorString(__macro_err) << "\n";\ __macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; \
tv::sstream_print(__macro_s, __VA_ARGS__); \ tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \ throw std::runtime_error(__macro_s.str()); \
} \ } \
} }
...@@ -362,7 +370,8 @@ struct ShapeBase : public SimpleVector<int, MaxDim> { ...@@ -362,7 +370,8 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){}; TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape) TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {} : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(SimpleVector<int, MaxDim> vec)
: SimpleVector<int, MaxDim>(vec) {}
template <typename T, template <class...> class Container> template <typename T, template <class...> class Container>
ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {} ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape) TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
...@@ -372,7 +381,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> { ...@@ -372,7 +381,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default; ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const { TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG #ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start); TV_ASSERT(start >= 0 && end <= this->mSize && end > start);
#endif #endif
ShapeBase<MaxDim> shape; ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) { for (int i = start; i < end; ++i) {
...@@ -417,6 +426,13 @@ struct ShapeBase : public SimpleVector<int, MaxDim> { ...@@ -417,6 +426,13 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
} }
return shape; return shape;
} }
TV_HOST_DEVICE size_t prod() const {
size_t res = 1;
for (size_t i = 0; i < this->mSize; ++i) {
res *= this->mArray[i];
}
return res;
}
}; };
using Shape = ShapeBase<TV_MAX_DIM>; using Shape = ShapeBase<TV_MAX_DIM>;
...@@ -545,6 +561,9 @@ template <typename T, int Rank = -1> struct TensorView { ...@@ -545,6 +561,9 @@ template <typename T, int Rank = -1> struct TensorView {
: mPtr(ptr) { : mPtr(ptr) {
mShape = {int(shapes)...}; mShape = {int(shapes)...};
} }
operator TensorView<const T>() {
return TensorView<const T>(mPtr, mShape);
} // conversion function
TV_HOST_DEVICE_INLINE TensorView<T, Rank> & TV_HOST_DEVICE_INLINE TensorView<T, Rank> &
assign(const TensorView<T, Rank> &tensor) { assign(const TensorView<T, Rank> &tensor) {
...@@ -846,6 +865,19 @@ template <typename T, int Rank = -1> struct TensorView { ...@@ -846,6 +865,19 @@ template <typename T, int Rank = -1> struct TensorView {
#endif #endif
return mPtr[idx]; return mPtr[idx];
} }
TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
// TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec). // TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) { /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index); int idx = rowArrayIdx(mShape, index);
...@@ -970,6 +1002,26 @@ template <typename T, int Rank = -1> struct TensorView { ...@@ -970,6 +1002,26 @@ template <typename T, int Rank = -1> struct TensorView {
mShape.subshape(sizeof...(ints) + 1)); mShape.subshape(sizeof...(ints) + 1));
} }
TV_HOST_DEVICE_INLINE TensorView<T, Rank>
subview_ints(SimpleVector<int> ids) const {
Shape start = ids;
for (int i = ids.size(); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(ids.size()));
}
std::string print_vec(TensorView<T> tensor) const {
std::ostringstream ss;
ss << "[";
for (size_t i = 0; i < tensor.dim(0) - 1; ++i) {
ss << tensor(i) << ", ";
}
ss << tensor(tensor.dim(0) - 1) << "]";
return ss.str();
}
std::string repr() const { std::string repr() const {
std::ostringstream ss; std::ostringstream ss;
if (empty()) if (empty())
...@@ -983,37 +1035,29 @@ template <typename T, int Rank = -1> struct TensorView { ...@@ -983,37 +1035,29 @@ template <typename T, int Rank = -1> struct TensorView {
} }
Shape counter = mShape; Shape counter = mShape;
auto tensor_flat = this->view(-1); auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
for (int i = 0; i < counter.ndim() - 1; ++i) {
counter[i] = 0; counter[i] = 0;
ss << "["; // ss << "[";
} }
for (size_t i = 0; i < this->size(); ++i) { for (size_t i = 0; i < this->size() / this->dim(this->ndim() - 1); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter)); for (int i = 0; i < counter.ndim() - 1; ++i) {
counter[counter.ndim() - 1] += 1; if (counter[i] == 0) {
int inc_count = 0; ss << "[";
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
} }
} }
if (print_comma && i != this->size() - 1) std::cout << "counter.ndim() " << counter.ndim() << std::endl;
ss << ", "; auto counter_ = counter.subshape(0, counter.ndim() - 1);
for (int j = 0; j < inc_count; ++j) { std::cout << counter.subshape(0, counter.ndim() - 1) << std::endl;
ss << "]"; ss << print_vec(this->subview_ints(counter_)) << "\n";
} std::cout << "after counter.ndim() " << counter.ndim() << std::endl;
if (i != this->size() - 1) { for (int i = 0; i < counter.ndim() - 1; ++i) {
if (inc_count != 0) if (counter[i] == this->dim(i) - 1) {
ss << "\n"; ss << "]";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
} }
} }
} }
ss << "]"; // ss << "]";
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape, // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>()); // detail::simpleTypeName<T>());
ss << "Tensor: dtype=" << detail::simpleTypeName<T>(); ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
...@@ -1161,4 +1205,160 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape, ...@@ -1161,4 +1205,160 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
return printTensorView(TensorView<const T>(ptr, shape), format); return printTensorView(TensorView<const T>(ptr, shape), format);
} }
#ifdef SPCONV_CUDA
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
static_cast<unsigned int>(result), func);
DEVICE_RESET
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void host2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToDevice, s));
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
host2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T> void host2dev_sync(T *dst, const T *src, size_t size) {
checkCudaErrors(
cudaMemcpy(dst, src, size * sizeof(T), cudaMemcpyHostToDevice));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<const T> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
template <typename T>
void host2dev_sync(TensorView<T> dst, const TensorView<T> src) {
host2dev_sync(dst.data(), src.data(), std::min(dst.size(), src.size()));
}
template <typename T>
void dev2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToHost, s));
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
dev2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2dev(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDeviceToDevice, s));
}
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void dev2dev(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
dev2dev(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2host(T *dst, const T *src, size_t size, cudaStream_t s = 0) {
checkCudaErrors(
cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyHostToHost, s));
}
template <typename T>
void host2host(TensorView<T> dst, const TensorView<const T> src,
cudaStream_t s = 0) {
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T>
void host2host(TensorView<T> dst, const TensorView<T> src, cudaStream_t s = 0) {
host2host(dst.data(), src.data(), std::min(dst.size(), src.size()), s);
}
template <typename T> void zero_dev(TensorView<T> tensor) {
checkCudaErrors(cudaMemset(tensor.data(), 0, tensor.size() * sizeof(T)));
}
template <typename T> void zero_dev(TensorView<T> tensor, cudaStream_t s) {
checkCudaErrors(
cudaMemsetAsync(tensor.data(), 0, tensor.size() * sizeof(T), s));
}
template <typename T> void zero_host(TensorView<T> tensor) {
std::fill(tensor.data(), tensor.data() + tensor.size(), 0);
}
#endif
namespace detail {
template <typename T> struct TypeToString;
template <> struct TypeToString<int32_t> {
static constexpr const char *value = "int32";
};
template <> struct TypeToString<bool> {
static constexpr const char *value = "bool";
};
template <> struct TypeToString<float> {
static constexpr const char *value = "float";
};
template <> struct TypeToString<double> {
static constexpr const char *value = "double";
};
template <> struct TypeToString<int16_t> {
static constexpr const char *value = "int16";
};
template <> struct TypeToString<int8_t> {
static constexpr const char *value = "int8";
};
template <> struct TypeToString<int64_t> {
static constexpr const char *value = "int64";
};
template <> struct TypeToString<uint8_t> {
static constexpr const char *value = "uint8";
};
template <> struct TypeToString<uint16_t> {
static constexpr const char *value = "uint16";
};
template <> struct TypeToString<uint32_t> {
static constexpr const char *value = "uint32";
};
template <> struct TypeToString<uint64_t> {
static constexpr const char *value = "uint64";
};
} // namespace detail
} // namespace tv } // namespace tv
\ No newline at end of file
// Copyright 2019 Yan Yan // Copyright 2019 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// //
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,24 +13,26 @@ ...@@ -13,24 +13,26 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <spconv/mp_helper.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <torch/script.h>
#ifdef SPCONV_CUDA #ifdef SPCONV_CUDA
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#endif #endif
namespace tv { namespace tv {
#ifdef SPCONV_CUDA #ifdef SPCONV_CUDA
struct TorchGPU: public tv::GPU { struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override { virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream(); return at::cuda::getCurrentCUDAStream();
} }
}; };
#endif #endif
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) { switch (tensor.scalar_type()) {
case at::ScalarType::Double: { case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value; auto val = std::is_same<std::remove_const_t<T>, double>::value;
TV_ASSERT_RT_ERR(val, "error"); TV_ASSERT_RT_ERR(val, "error");
...@@ -60,34 +62,34 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { ...@@ -60,34 +62,34 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
TV_ASSERT_RT_ERR(false, "error"); TV_ASSERT_RT_ERR(false, "error");
} }
} }
namespace detail {
template <typename T> struct TypeToTorchDtypeTraits;
template <typename T> template <> struct TypeToTorchDtypeTraits<int32_t> {
constexpr auto type2torch(T val=T()) -> decltype(torch::kInt32){ static constexpr decltype(torch::kInt32) value = torch::kInt32;
TV_ASSERT_RT_ERR(false, "unknown type"); };
}
template <>
constexpr auto type2torch(int val) -> decltype(torch::kInt32){
return torch::kInt32;
}
template <> template <> struct TypeToTorchDtypeTraits<int64_t> {
constexpr auto type2torch(long val) -> decltype(torch::kInt32){ static constexpr decltype(torch::kInt32) value = torch::kInt64;
return torch::kInt64; };
}
template <> template <> struct TypeToTorchDtypeTraits<float> {
constexpr auto type2torch(float val) -> decltype(torch::kInt32){ static constexpr decltype(torch::kInt32) value = torch::kFloat32;
return torch::kFloat32; };
} template <> struct TypeToTorchDtypeTraits<double> {
static constexpr decltype(torch::kInt32) value = torch::kFloat64;
};
template <> struct TypeToTorchDtypeTraits<at::Half> {
static constexpr decltype(torch::kInt32) value = torch::kHalf;
};
template <> } // namespace detail
constexpr auto type2torch(double val) -> decltype(torch::kInt32){
return torch::kFloat64;
}
template <typename T> template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) { constexpr decltype(torch::kInt32) torch_type_v =
detail::TypeToTorchDtypeTraits<T>::value;
template <typename T> tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<T>(tensor); check_torch_dtype<T>(tensor);
tv::Shape shape; tv::Shape shape;
for (auto i : tensor.sizes()) { for (auto i : tensor.sizes()) {
...@@ -95,4 +97,28 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) { ...@@ -95,4 +97,28 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
} }
return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape); return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
} }
namespace detail {
template <> struct TypeToString<at::Half> {
static constexpr const char *value = "half";
};
} // namespace detail
template <class... Ts, typename F>
void torch_dispatch(at::ScalarType t, F &&f) {
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (torch_type_v<decltype(I)> == t) {
std::forward<F>(f)(decltype(I)());
notFound = false;
}
});
if (notFound) {
std::stringstream ss;
spconv::mp_for_each<spconv::mp_list<Ts...>>([=, &ss](auto I) {
ss << tv::detail::TypeToString<decltype(I)>::value << " ";
});
TV_THROW_RT_ERR("unknown type", t, ", available: ", ss.str());
}
}
} // namespace tv } // namespace tv
\ No newline at end of file
...@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule): ...@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
inverse=False, inverse=False,
indice_key=None, indice_key=None,
fused_bn=False, fused_bn=False,
use_hash=True): use_hash=False):
super(SparseConvolution, self).__init__() super(SparseConvolution, self).__init__()
assert groups == 1 assert groups == 1
if not isinstance(kernel_size, (list, tuple)): if not isinstance(kernel_size, (list, tuple)):
...@@ -221,7 +221,7 @@ class SparseConv2d(SparseConvolution): ...@@ -221,7 +221,7 @@ class SparseConv2d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SparseConv2d, self).__init__( super(SparseConv2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -247,7 +247,7 @@ class SparseConv3d(SparseConvolution): ...@@ -247,7 +247,7 @@ class SparseConv3d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SparseConv3d, self).__init__( super(SparseConv3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -273,7 +273,7 @@ class SparseConv4d(SparseConvolution): ...@@ -273,7 +273,7 @@ class SparseConv4d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SparseConv4d, self).__init__( super(SparseConv4d, self).__init__(
4, 4,
in_channels, in_channels,
...@@ -299,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -299,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SparseConvTranspose2d, self).__init__( super(SparseConvTranspose2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -326,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -326,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SparseConvTranspose3d, self).__init__( super(SparseConvTranspose3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -387,7 +387,7 @@ class SubMConv2d(SparseConvolution): ...@@ -387,7 +387,7 @@ class SubMConv2d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SubMConv2d, self).__init__( super(SubMConv2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -414,7 +414,7 @@ class SubMConv3d(SparseConvolution): ...@@ -414,7 +414,7 @@ class SubMConv3d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SubMConv3d, self).__init__( super(SubMConv3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -441,7 +441,7 @@ class SubMConv4d(SparseConvolution): ...@@ -441,7 +441,7 @@ class SubMConv4d(SparseConvolution):
groups=1, groups=1,
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=True): use_hash=False):
super(SubMConv4d, self).__init__( super(SubMConv4d, self).__init__(
4, 4,
in_channels, in_channels,
......
...@@ -111,16 +111,9 @@ def indice_conv(features, ...@@ -111,16 +111,9 @@ def indice_conv(features,
num_activate_out, num_activate_out,
inverse=False, inverse=False,
subm=False): subm=False):
if filters.dtype == torch.float32: return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs, indice_pair_num, num_activate_out,
indice_pair_num, num_activate_out, int(inverse), int(subm))
int(inverse), int(subm))
elif filters.dtype == torch.half:
return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
indice_pair_num, num_activate_out,
int(inverse), int(subm))
else:
raise NotImplementedError
def fused_indice_conv(features, filters, bias, def fused_indice_conv(features, filters, bias,
indice_pairs, indice_pairs,
...@@ -145,14 +138,8 @@ def indice_conv_backward(features, ...@@ -145,14 +138,8 @@ def indice_conv_backward(features,
indice_pair_num, indice_pair_num,
inverse=False, inverse=False,
subm=False): subm=False):
if filters.dtype == torch.float32: return torch.ops.spconv.indice_conv_backward(
return torch.ops.spconv.indice_conv_backward_fp32( features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
elif filters.dtype == torch.half:
return torch.ops.spconv.indice_conv_backward_half(
features, filters, out_bp, indice_pairs, indice_pair_num, int(inverse), int(subm))
else:
raise NotImplementedError
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out): def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
......
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc) set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu) set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
endif() endif()
......
...@@ -26,11 +26,8 @@ static auto registry = ...@@ -26,11 +26,8 @@ static auto registry =
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>) .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>) .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>) .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv_fp32", &spconv::indiceConv<float>) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
.op("spconv::indice_conv_backward_half",
&spconv::indiceConvBackward<at::Half>)
.op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>) .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
.op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>) .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>) .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
......
#include <spconv/spconv_ops.h>
namespace spconv {
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
tv::torch_dispatch<float, double, at::Half>(
features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
});
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
tv::torch_dispatch<float, double,
at::Half>(features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
});
return {inputGrad, filtersGrad.view(filterShape)};
}
} // namespace spconv
\ No newline at end of file
...@@ -581,7 +581,7 @@ def main(): ...@@ -581,7 +581,7 @@ def main():
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue
device = torch.device(dev) device = torch.device(dev)
num_points = [5] * bs num_points = [500] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
...@@ -601,7 +601,7 @@ def main(): ...@@ -601,7 +601,7 @@ def main():
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = [] times = []
for i in range(0): for i in range(10):
t = time.time() t = time.time()
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
torch.cuda.synchronize() torch.cuda.synchronize()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment