Unverified Commit 07b738c5 authored by Anton Gorenko's avatar Anton Gorenko Committed by GitHub
Browse files

Support ROCm 7 (#5162)

* Remove std::enable_if, warpRotateLeft is always used with TILE_SIZE

* Do not use built-in warpSize in constexpr contexts

Starting from ROCm 7 warpSize is no longer constexpr.
findInteractingBlocks.hip uses it for sizes of __shared__ arrays.

* Check if hipHostMallocNumaUser is allowed before using it
parent bb5552c0
...@@ -636,6 +636,7 @@ private: ...@@ -636,6 +636,7 @@ private:
int multiprocessors; int multiprocessors;
int sharedMemPerBlock; int sharedMemPerBlock;
bool supportsHardwareFloatGlobalAtomicAdd; bool supportsHardwareFloatGlobalAtomicAdd;
unsigned int hostMallocFlags;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasAssignedPosqCharges; bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasAssignedPosqCharges;
bool isLinkedContext; bool isLinkedContext;
std::string tempDir, cacheDir, gpuArchitecture; std::string tempDir, cacheDir, gpuArchitecture;
......
...@@ -182,6 +182,17 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -182,6 +182,17 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
this->supportsHardwareFloatGlobalAtomicAdd = true; this->supportsHardwareFloatGlobalAtomicAdd = true;
} }
hostMallocFlags = hipHostMallocDefault;
#if !defined(WIN32)
// hipHostMallocNumaUser may not be allowed in some conditions, for example, if docker container
// is created without --security-opt seccomp=unconfined or --cap-add=SYS_NICE
int* tmpHostBuffer;
if(hipHostMalloc(&tmpHostBuffer, sizeof(*tmpHostBuffer), hipHostMallocNumaUser) == hipSuccess) {
CHECK_RESULT(hipHostFree(tmpHostBuffer));
hostMallocFlags = hipHostMallocNumaUser;
}
#endif
contextIsValid = true; contextIsValid = true;
ContextSelector selector(*this); ContextSelector selector(*this);
if (contextIndex > 0 && originalContext == NULL) { if (contextIndex > 0 && originalContext == NULL) {
...@@ -911,9 +922,5 @@ unsigned int HipContext::getEventFlags() { ...@@ -911,9 +922,5 @@ unsigned int HipContext::getEventFlags() {
} }
unsigned int HipContext::getHostMallocFlags() { unsigned int HipContext::getHostMallocFlags() {
#ifdef WIN32 return hostMallocFlags;
return hipHostMallocDefault;
#else
return hipHostMallocNumaUser;
#endif
} }
...@@ -680,10 +680,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -680,10 +680,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["SAVE_DERIVATIVES"] = saveDerivs.str(); replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData; stringstream shuffleWarpData;
shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n"; shuffleWarpData << "shflPosq = warpRotateLeft(shflPosq);\n";
shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n"; shuffleWarpData << "shflForce = warpRotateLeft(shflForce);\n";
for (const ComputeParameterInfo& param : params) { for (const ComputeParameterInfo& param : params) {
shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n"; shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft(shfl"<<param.getName()<<");\n";
} }
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str(); replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
......
...@@ -315,6 +315,13 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti ...@@ -315,6 +315,13 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
if (rebuildNeighborList[0] == 0) if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt. return; // The neighbor list doesn't need to be rebuilt.
// Starting from ROCm 7 warpSize is no longer constexpr. Redefine as a local value for using it
// in sizes of __shared__ arrays:
#if defined(AMD_RDNA)
constexpr int warpSize = 32;
#else
constexpr int warpSize = 64;
#endif
constexpr int tilesPerWarp = warpSize/TILE_SIZE; constexpr int tilesPerWarp = warpSize/TILE_SIZE;
constexpr int warpsPerBlock = GROUP_SIZE/warpSize; constexpr int warpsPerBlock = GROUP_SIZE/warpSize;
const int indexInWarp = threadIdx.x%warpSize; const int indexInWarp = threadIdx.x%warpSize;
......
...@@ -2,17 +2,6 @@ ...@@ -2,17 +2,6 @@
* This file contains the device function for using cross-lane operations (ballot and shuffle) * This file contains the device function for using cross-lane operations (ballot and shuffle)
*/ */
#if defined(TILE_SIZE)
#if !defined(AMD_RDNA)
// Two subwarps per warp
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
#else
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
#endif
#endif
template<class T> template<class T>
static __inline__ __device__ static __inline__ __device__
T warpShuffle(const T& input, const int src_lane) { T warpShuffle(const T& input, const int src_lane) {
...@@ -21,7 +10,7 @@ T warpShuffle(const T& input, const int src_lane) { ...@@ -21,7 +10,7 @@ T warpShuffle(const T& input, const int src_lane) {
T output; T output;
#pragma unroll #pragma unroll
for(int i = 0; i < words_no; i++) { for (int i = 0; i < words_no; i++) {
int word; int word;
__builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int)); __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word); word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
...@@ -31,16 +20,30 @@ T warpShuffle(const T& input, const int src_lane) { ...@@ -31,16 +20,30 @@ T warpShuffle(const T& input, const int src_lane) {
return output; return output;
} }
template<int Subwarp, class T> #if defined(AMD_RDNA)
// RDNA: device warp size = tile size = 32
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
template<class T>
static __inline__ __device__ static __inline__ __device__
typename std::enable_if<(Subwarp == warpSize), T>::type T warpRotateLeft(const T& input) {
warpRotateLeft(const T& input) {
return warpShuffle(input, threadIdx.x + 1); return warpShuffle(input, threadIdx.x + 1);
} }
template<int Subwarp, class T> #else
// CDNA: device warp size = 64, tile size = 32
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (32 - 1), 32)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (32 - 1))))
template<class T>
static __inline__ __device__ static __inline__ __device__
typename std::enable_if<!(Subwarp == warpSize), T>::type T warpRotateLeft(const T& input) {
warpRotateLeft(const T& input) { return warpShuffle(input, ((threadIdx.x + 1) & (32 - 1)) | (threadIdx.x & ~(32 - 1)));
return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
} }
#endif
...@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded ...@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
excl >>= 1; excl >>= 1;
#endif #endif
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex); atomIndex = warpRotateLeft(atomIndex);
} }
const unsigned int offset = atomIndex; const unsigned int offset = atomIndex;
// write results for off diagonal tiles // write results for off diagonal tiles
...@@ -367,7 +367,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded ...@@ -367,7 +367,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
} }
#endif #endif
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex); atomIndex = warpRotateLeft(atomIndex);
} }
} }
else else
...@@ -426,7 +426,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded ...@@ -426,7 +426,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
} }
#endif #endif
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex); atomIndex = warpRotateLeft(atomIndex);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment