"tests/1.sh" did not exist on "ce671dd4b974ffb18eb80cdb267d77f0b00d60b1"
Unverified Commit 07b738c5 authored by Anton Gorenko's avatar Anton Gorenko Committed by GitHub
Browse files

Support ROCm 7 (#5162)

* Remove std::enable_if, warpRotateLeft is always used with TILE_SIZE

* Do not use built-in warpSize in constexpr contexts

Starting from ROCm 7 warpSize is no longer constexpr.
findInteractingBlocks.hip uses it for sizes of __shared__ arrays.

* Check if hipHostMallocNumaUser is allowed before using it
parent bb5552c0
......@@ -636,6 +636,7 @@ private:
int multiprocessors;
int sharedMemPerBlock;
bool supportsHardwareFloatGlobalAtomicAdd;
unsigned int hostMallocFlags;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasAssignedPosqCharges;
bool isLinkedContext;
std::string tempDir, cacheDir, gpuArchitecture;
......
......@@ -182,6 +182,17 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
this->supportsHardwareFloatGlobalAtomicAdd = true;
}
hostMallocFlags = hipHostMallocDefault;
#if !defined(WIN32)
// hipHostMallocNumaUser may not be allowed in some conditions, for example, if docker container
// is created without --security-opt seccomp=unconfined or --cap-add=SYS_NICE
int* tmpHostBuffer;
if(hipHostMalloc(&tmpHostBuffer, sizeof(*tmpHostBuffer), hipHostMallocNumaUser) == hipSuccess) {
CHECK_RESULT(hipHostFree(tmpHostBuffer));
hostMallocFlags = hipHostMallocNumaUser;
}
#endif
contextIsValid = true;
ContextSelector selector(*this);
if (contextIndex > 0 && originalContext == NULL) {
......@@ -911,9 +922,5 @@ unsigned int HipContext::getEventFlags() {
}
unsigned int HipContext::getHostMallocFlags() {
#ifdef WIN32
return hipHostMallocDefault;
#else
return hipHostMallocNumaUser;
#endif
return hostMallocFlags;
}
......@@ -680,10 +680,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData;
shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
shuffleWarpData << "shflPosq = warpRotateLeft(shflPosq);\n";
shuffleWarpData << "shflForce = warpRotateLeft(shflForce);\n";
for (const ComputeParameterInfo& param : params) {
shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n";
shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft(shfl"<<param.getName()<<");\n";
}
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
......
......@@ -315,6 +315,13 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
// Starting from ROCm 7 warpSize is no longer constexpr. Redefine as a local value for using it
// in sizes of __shared__ arrays:
#if defined(AMD_RDNA)
constexpr int warpSize = 32;
#else
constexpr int warpSize = 64;
#endif
constexpr int tilesPerWarp = warpSize/TILE_SIZE;
constexpr int warpsPerBlock = GROUP_SIZE/warpSize;
const int indexInWarp = threadIdx.x%warpSize;
......
......@@ -2,17 +2,6 @@
* This file contains the device function for using cross-lane operations (ballot and shuffle)
*/
#if defined(TILE_SIZE)
#if !defined(AMD_RDNA)
// Two subwarps per warp
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
#else
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
#endif
#endif
template<class T>
static __inline__ __device__
T warpShuffle(const T& input, const int src_lane) {
......@@ -21,7 +10,7 @@ T warpShuffle(const T& input, const int src_lane) {
T output;
#pragma unroll
for(int i = 0; i < words_no; i++) {
for (int i = 0; i < words_no; i++) {
int word;
__builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
......@@ -31,16 +20,30 @@ T warpShuffle(const T& input, const int src_lane) {
return output;
}
template<int Subwarp, class T>
#if defined(AMD_RDNA)
// RDNA: device warp size = tile size = 32
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
template<class T>
static __inline__ __device__
typename std::enable_if<(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
T warpRotateLeft(const T& input) {
return warpShuffle(input, threadIdx.x + 1);
}
template<int Subwarp, class T>
#else
// CDNA: device warp size = 64, tile size = 32
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (32 - 1), 32)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (32 - 1))))
template<class T>
static __inline__ __device__
typename std::enable_if<!(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
T warpRotateLeft(const T& input) {
return warpShuffle(input, ((threadIdx.x + 1) & (32 - 1)) | (threadIdx.x & ~(32 - 1)));
}
#endif
......@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
excl >>= 1;
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
atomIndex = warpRotateLeft(atomIndex);
}
const unsigned int offset = atomIndex;
// write results for off diagonal tiles
......@@ -367,7 +367,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
}
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
atomIndex = warpRotateLeft(atomIndex);
}
}
else
......@@ -426,7 +426,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
}
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
atomIndex = warpRotateLeft(atomIndex);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment