/*!
 * Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
// Compatibility shims that let code written against CUDA warp/atomic
// primitives build for ROCm (HIP) as well. Everything here is active only
// when USE_CUDA is defined.
#ifdef USE_CUDA

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)

// ROCm doesn't have __shfl_down_sync / __shfl_up_sync, only __shfl_down /
// __shfl_up without a mask. The mask argument is discarded (it is not even
// evaluated), which is only equivalent when callers pass a full mask such as
// 0xffffffff.
// Everything after `val` is forwarded unchanged, so both the three-argument
// form (mask, val, offset) and the four-argument form with an explicit
// trailing `width` expand to a valid mask-less shuffle call.
#define __shfl_down_sync(mask, val, ...) __shfl_down(val, __VA_ARGS__)
#define __shfl_up_sync(mask, val, ...) __shfl_up(val, __VA_ARGS__)

// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
#define WARPSIZE warpSize

// ROCm doesn't have atomicAdd_block, but it should be semantically the same
// as atomicAdd.
#define atomicAdd_block atomicAdd

#else  // plain CUDA build

// CUDA warpSize is not a constexpr, but always 32.
#define WARPSIZE 32

#endif  // defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)

#endif  // USE_CUDA