// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility>

namespace spconv {
namespace functor {

template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
  // Vector load type: 8 bytes (int2) for half, 16 bytes (int4) otherwise,
  // so a single load always moves several elements of T at once.
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  // Candidate tile sizes (threads per plane), tried in order at compile time.
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;

  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0)
      return;
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      int nHotBlock = (size / NumTLP) * NumTLP; // largest multiple of NumTLP
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            // Full NumTLP x NumTLP tiles: vectorized block kernel.
            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            // Remainder rows that do not fill a whole tile.
            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(),
                                    indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      // No candidate tile size divides numPlanes: fall back to the
      // non-vectorized generic kernel.
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;

  // Note: `stable` is part of the functor interface but is not used in this
  // GPU path.
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    if (size <= 0)
      return;
    int numPlanes = outFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(T); // important for half.
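    // Same compile-time tile-size search as the gather functor above:
    // mp_for_each instantiates the lambda for NumTLP = 64, 32, 16 in order,
    // and the first tile size that evenly divides numPlanes launches the
    // vectorized kernels; `notFound` keeps the remaining candidates from
    // launching a second time. vecloadFactor is the number of T elements per
    // vector load (int2 holds 4 halfs; int4 holds 4 floats or 2 doubles).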
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      constexpr int NumILP = NumTLP / 4;
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                    outFeatures.data(),
                    buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

} // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                   \
  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;            \
  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX

} // namespace spconv
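
// Hedged usage sketch (kept out of the build with #if 0; not part of the
// original file). The function and parameter names below are hypothetical;
// only the functor interfaces above are real. A sparse convolution forward
// pass uses these functors roughly as follows: gather the nHot active input
// rows into a dense buffer, run a GEMM on the buffer (omitted here), then
// scatter-add the product into the output feature rows.
#if 0
void exampleGatherScatterAdd(const tv::GPU &d, float *bufferPtr,
                             const float *featuresPtr, float *outFeaturesPtr,
                             const int *indicesInPtr, const int *indicesOutPtr,
                             int nHot, int numActIn, int numActOut,
                             int numPlanes) {
  spconv::functor::SparseGatherFunctor<tv::GPU, float, int> gather;
  // buffer[i, :] = features[indicesIn[i], :] for i in [0, nHot)
  gather(d, tv::TensorView<float>(bufferPtr, {nHot, numPlanes}),
         tv::TensorView<const float>(featuresPtr, {numActIn, numPlanes}),
         tv::TensorView<const int>(indicesInPtr, {nHot}), nHot);
  // ... GEMM on the buffer (omitted) ...
  spconv::functor::SparseScatterAddFunctor<tv::GPU, float, int> scatterAdd;
  // outFeatures[indicesOut[i], :] += buffer[i, :] for i in [0, nHot)
  scatterAdd(d, tv::TensorView<float>(outFeaturesPtr, {numActOut, numPlanes}),
             tv::TensorView<const float>(bufferPtr, {nHot, numPlanes}),
             tv::TensorView<const int>(indicesOutPtr, {nHot}), nHot,
             /*stable=*/false);
}
#endif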