Fix computeNonbonded hang on the HIP platform (#4959)

* Add a workaround for infinite loop in computeNonbonded (HIP) computeNonbonded hangs in some tests (without neighbor list). Reproducible on ROCm 6.4 and 6.4.1 (maybe on older versions too) on various architectures (both CDNA and RDNA). Affected tests: TestHipATMForce, TestHipMonteCarloBarostat, TestHipNonbondedForce, TestHipVirtualSites. Disassembly shows that the compiler splits branches of `if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS)` and does `SHFL(skipTiles, TILE_SIZE-1) < pos` checks in them separately, even though `__builtin_amdgcn_ds_bpermute` is a convergent function. Apparently in this case not all lanes participate in each call. * Simplify includeTile check using ballot

Fix computeNonbonded hang on the HIP platform (#4959)
* Add a workaround for infinite loop in computeNonbonded (HIP) computeNonbonded hangs in some tests (without neighbor list). Reproducible on ROCm 6.4 and 6.4.1 (maybe on older versions too) on various architectures (both CDNA and RDNA). Affected tests: TestHipATMForce, TestHipMonteCarloBarostat, TestHipNonbondedForce, TestHipVirtualSites. Disassembly shows that the compiler splits branches of `if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS)` and does `SHFL(skipTiles, TILE_SIZE-1) < pos` checks in them separately, even though `__builtin_amdgcn_ds_bpermute` is a convergent function. Apparently in this case not all lanes participate in each call. * Simplify includeTile check using ballot
a4b43a04 · Anton Gorenko · GitHub · e03ee784 · a4b43a04
Unverified Commit a4b43a04 authored Jun 08, 2025 by Anton Gorenko Committed by GitHub Jun 07, 2025
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 9 deletions

platforms/hip/src/kernels/nonbonded.hip platforms/hip/src/kernels/nonbonded.hip +3 -9

No files found.
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
@@ -72,7 +72,6 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
    mixed energy = 0;
    INIT_DERIVATIVES

@@ -250,9 +249,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
    int skipBase = 0;
-    int currentSkipIndex = tbx;
-    int skipTiles;
-    skipTiles = -1;
+    int skipTiles = -1;
    for (; pos < end; pos++) {
 #endif
        real3 force = make_real3(0);
@@ -277,7 +274,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded

        // Skip over tiles that have exclusions, since they were already processed.

-        while (SHFL(skipTiles, TILE_SIZE-1) < pos) {
+        while (BALLOT(skipTiles >= pos) == 0) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                int2 tile = exclusionTiles[skipBase+tgx];
                skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
@@ -285,11 +282,8 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
            else
                skipTiles = end;
            skipBase += TILE_SIZE;
-            currentSkipIndex = 0;
        }
-        while (SHFL(skipTiles, currentSkipIndex) < pos)
-            currentSkipIndex++;
-        includeTile = (SHFL(skipTiles, currentSkipIndex) != pos);
+        includeTile = BALLOT(skipTiles == pos) == 0;
 #endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;