Commit 48b9a86a authored by traveller59's avatar traveller59
Browse files

fix cpu bug, increase cpu speed

parent e8533f47
...@@ -185,164 +185,6 @@ TV_HOST_DEVICE Index getValidOutPosTranspose( ...@@ -185,164 +185,6 @@ TV_HOST_DEVICE Index getValidOutPosTranspose(
return pointCounter; return pointCounter;
} }
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (hash.find(index) == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hash[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (hash.find(index) == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hash[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (hash.find(index) == hash.end()) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
}
}
}
return numActIn;
}
} // namespace spconv } // namespace spconv
#endif #endif
\ No newline at end of file
...@@ -4,6 +4,12 @@ if (SPCONV_BuildCUDA) ...@@ -4,6 +4,12 @@ if (SPCONV_BuildCUDA)
endif() endif()
add_library(spconv SHARED ${ALL_FILES}) add_library(spconv SHARED ${ALL_FILES})
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
endif()
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ) target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
set_property(TARGET spconv PROPERTY CUDA_STANDARD 14) set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14) set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
......
...@@ -16,9 +16,185 @@ ...@@ -16,9 +16,185 @@
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <torch/script.h> #include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv { namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
}else{
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
}else{
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end){
Index index = 0;
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index oldOffset = 0;
for (int j = begin; j < end; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
#pragma omp atomic capture
oldOffset = indiceNum[offset]++;
indicePairs(offset, 0, oldOffset) = j;
indicePairs(offset, 1, oldOffset) = iter->second;
}
}
}
});
return numActIn;
}
namespace functor { namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> { struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
......
...@@ -41,15 +41,13 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> { ...@@ -41,15 +41,13 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
int numPlanes = outFeatures.dim(1); int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data(); const T* buf = buffer.data();
T* out = outFeatures.data(); T* out = outFeatures.data();
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){ for (int i = 0; i < size; ++i) {
for (int i = begin; i < end; ++i) { buf = buffer.data() + i * numPlanes;
buf = buffer.data() + i * numPlanes; out = outFeatures.data() + indices[i] * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes; for (int j = 0; j < numPlanes; ++j){
for (int j = 0; j < numPlanes; ++j){ out[j] += buf[j];
out[j] += buf[j];
}
} }
}); }
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment