Support ROCm 7 (#5162)

* Remove std::enable_if, warpRotateLeft is always used with TILE_SIZE * Do not use built-in warpSize in constexpr contexts Starting from ROCm 7 warpSize is no longer constexpr. findInteractingBlocks.hip uses it for sizes of __shared__ arrays. * Check if hipHostMallocNumaUser is allowed before using it

Support ROCm 7 (#5162)
* Remove std::enable_if, warpRotateLeft is always used with TILE_SIZE * Do not use built-in warpSize in constexpr contexts Starting from ROCm 7 warpSize is no longer constexpr. findInteractingBlocks.hip uses it for sizes of __shared__ arrays. * Check if hipHostMallocNumaUser is allowed before using it
07b738c5 · Anton Gorenko · GitHub · bb5552c0 · 07b738c5 · 07b738c5
Unverified Commit 07b738c5 authored Dec 14, 2025 by Anton Gorenko Committed by GitHub Dec 14, 2025
6 changed files
--- a/platforms/hip/include/HipContext.h
+++ b/platforms/hip/include/HipContext.h
@@ -636,6 +636,7 @@ private:
    int multiprocessors;
    int sharedMemPerBlock;
    bool supportsHardwareFloatGlobalAtomicAdd;
+    unsigned int hostMallocFlags;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasAssignedPosqCharges;
    bool isLinkedContext;
    std::string tempDir, cacheDir, gpuArchitecture;

--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -182,6 +182,17 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
        this->supportsHardwareFloatGlobalAtomicAdd = true;
    }

+    hostMallocFlags = hipHostMallocDefault;
+#if !defined(WIN32)
+    // hipHostMallocNumaUser may not be allowed in some conditions, for example, if docker container 
+    // is created without --security-opt seccomp=unconfined or --cap-add=SYS_NICE
+    int* tmpHostBuffer;
+    if(hipHostMalloc(&tmpHostBuffer, sizeof(*tmpHostBuffer), hipHostMallocNumaUser) == hipSuccess) {
+        CHECK_RESULT(hipHostFree(tmpHostBuffer));
+        hostMallocFlags = hipHostMallocNumaUser;
+    }
+#endif
+
    contextIsValid = true;
    ContextSelector selector(*this);
    if (contextIndex > 0 && originalContext == NULL) {
@@ -911,9 +922,5 @@ unsigned int HipContext::getEventFlags() {
 }

 unsigned int HipContext::getHostMallocFlags() {
-#ifdef WIN32
-    return hipHostMallocDefault;
-#else
-    return hipHostMallocNumaUser;
-#endif
+    return hostMallocFlags;
 }
--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -680,10 +680,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();

    stringstream shuffleWarpData;
-    shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
-    shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
+    shuffleWarpData << "shflPosq = warpRotateLeft(shflPosq);\n";
+    shuffleWarpData << "shflForce = warpRotateLeft(shflForce);\n";
    for (const ComputeParameterInfo& param : params) {
-        shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n";
+        shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft(shfl"<<param.getName()<<");\n";
    }
    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();


--- a/platforms/hip/src/kernels/findInteractingBlocks.hip
+++ b/platforms/hip/src/kernels/findInteractingBlocks.hip
@@ -315,6 +315,13 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

+    // Starting from ROCm 7 warpSize is no longer constexpr. Redefine as a local value for using it
+    // in sizes of __shared__ arrays:
+#if defined(AMD_RDNA)
+    constexpr int warpSize = 32;
+#else
+    constexpr int warpSize = 64;
+#endif
    constexpr int tilesPerWarp = warpSize/TILE_SIZE;
    constexpr int warpsPerBlock = GROUP_SIZE/warpSize;
    const int indexInWarp = threadIdx.x%warpSize;

--- a/platforms/hip/src/kernels/intrinsics.hip
+++ b/platforms/hip/src/kernels/intrinsics.hip
@@ -2,17 +2,6 @@
 * This file contains the device function for using cross-lane operations (ballot and shuffle)
 */

-#if defined(TILE_SIZE)
-#if !defined(AMD_RDNA)
-// Two subwarps per warp
-#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
-#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
-#else
-#define SHFL(var, srcLane) __shfl(var, srcLane)
-#define BALLOT(var) __ballot(var)
-#endif
-#endif
-
 template<class T>
 static __inline__ __device__
 T warpShuffle(const T& input, const int src_lane) {
@@ -21,7 +10,7 @@ T warpShuffle(const T& input, const int src_lane) {

    T output;
    #pragma unroll
-    for(int i = 0; i < words_no; i++) {
+    for (int i = 0; i < words_no; i++) {
        int word;
        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
        word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
@@ -31,16 +20,30 @@ T warpShuffle(const T& input, const int src_lane) {
    return output;
 }

-template<int Subwarp, class T>
+#if defined(AMD_RDNA)
+
+// RDNA: device warp size = tile size = 32
+
+#define SHFL(var, srcLane) __shfl(var, srcLane)
+#define BALLOT(var) __ballot(var)
+
+template<class T>
 static __inline__ __device__
-typename std::enable_if<(Subwarp == warpSize), T>::type
-warpRotateLeft(const T& input) {
+T warpRotateLeft(const T& input) {
    return warpShuffle(input, threadIdx.x + 1);
 }

-template<int Subwarp, class T>
+#else
+
+// CDNA: device warp size = 64, tile size = 32
+
+#define SHFL(var, srcLane) __shfl(var, (srcLane) & (32 - 1), 32)
+#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (32 - 1))))
+
+template<class T>
 static __inline__ __device__
-typename std::enable_if<!(Subwarp == warpSize), T>::type
-warpRotateLeft(const T& input) {
-    return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
+T warpRotateLeft(const T& input) {
+    return warpShuffle(input, ((threadIdx.x + 1) & (32 - 1)) | (threadIdx.x & ~(32 - 1)));
 }
+
+#endif
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
                excl >>= 1;
 #endif
                SHUFFLE_WARP_DATA
-                atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+                atomIndex = warpRotateLeft(atomIndex);
            }
            const unsigned int offset = atomIndex;
            // write results for off diagonal tiles
@@ -367,7 +367,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
                    }
 #endif
                    SHUFFLE_WARP_DATA
-                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+                    atomIndex = warpRotateLeft(atomIndex);
                }
            }
            else
@@ -426,7 +426,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
                    }
 #endif
                    SHUFFLE_WARP_DATA
-                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+                    atomIndex = warpRotateLeft(atomIndex);
                }
            }