// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "paddle/extension.h" #include #include constexpr int kBlockSize = 256; constexpr int kNumWaves = 16; inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) { int dev; { cudaError_t err = cudaGetDevice(&dev); if (err != cudaSuccess) { return err; } } int sm_count; { cudaError_t err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); if (err != cudaSuccess) { return err; } } int tpm; { cudaError_t err = cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev); if (err != cudaSuccess) { return err; } } *num_blocks = std::max(1, std::min((n + kBlockSize - 1) / kBlockSize, sm_count * tpm / kBlockSize * kNumWaves)); return cudaSuccess; } template __device__ T max_func(const T a, const T b) { return a > b ? a : b; } template struct MaxOp { __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max_func(a, b); } }; template class PDTraits; template <> class PDTraits { public: typedef float DataType; typedef float data_t; }; template <> class PDTraits { public: typedef half DataType; typedef paddle::float16 data_t; }; template <> class PDTraits { public: typedef __nv_bfloat16 DataType; typedef paddle::bfloat16 data_t; }; template struct alignas(sizeof(T) * Size) AlignedVector { T val[Size]; HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } HOSTDEVICE inline T& operator[](int i) { return val[i]; } }; template HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { const AlignedVector* addr_vec = reinterpret_cast*>(addr); *vec = *addr_vec; } template HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { AlignedVector* addr_vec = reinterpret_cast*>(addr); *addr_vec = vec; } constexpr int VEC_16B = 16;