Optimize computeNonbonded

* All AMD GPUs support shuffle, double precision and 64-bit int atomics; * Remove unused code: !ENABLE_SHUFFLE code paths in nonbonded.hip; * Use intrinsics in single-precision; * Use realToFixedPoint (faster float32-to-int64); * Remove shared atomIndices, use shuffles; * Check early if atoms are in the cutoff range, sometimes all lanes in a warp can skip computations, single pairs can also skip useless atomics with zero values; * Remove volatile skipTiles access, use shuffles; * Distribute work for warps in a strided order; * Skip warps that may be still busy in the first loop; * Unify conditions for excluded atoms with `includeInteraction`; * Move multiprocessors to HipContext; * Increase number of warps for computeNonbonded; * Disable packed math for >=MI200 (it affects performance of some kernels like computeGKForces of amoebagk); * Remove defaultOptimizationOptions and createModule's optimizationFlags as they are never used; * Support -save-temps.

Optimize computeNonbonded
* All AMD GPUs support shuffle, double precision and 64-bit int atomics; * Remove unused code: !ENABLE_SHUFFLE code paths in nonbonded.hip; * Use intrinsics in single-precision; * Use realToFixedPoint (faster float32-to-int64); * Remove shared atomIndices, use shuffles; * Check early if atoms are in the cutoff range, sometimes all lanes in a warp can skip computations, single pairs can also skip useless atomics with zero values; * Remove volatile skipTiles access, use shuffles; * Distribute work for warps in a strided order; * Skip warps that may be still busy in the first loop; * Unify conditions for excluded atoms with `includeInteraction`; * Move multiprocessors to HipContext; * Increase number of warps for computeNonbonded; * Disable packed math for >=MI200 (it affects performance of some kernels like computeGKForces of amoebagk); * Remove defaultOptimizationOptions and createModule's optimizationFlags as they are never used; * Support -save-temps.
67f5644d · Anton Gorenko · 7279c539 · 67f5644d · 67f5644d · 67f5644d
Unverified Commit 67f5644d authored Aug 25, 2024 by Anton Gorenko
6 changed files
--- a/platforms/hip/include/HipContext.h
+++ b/platforms/hip/include/HipContext.h
@@ -60,7 +60,7 @@
 #include "openmm/common/ComputeContext.h"
 #include "openmm/Kernel.h"

-typedef unsigned long tileflags;
+typedef unsigned int tileflags;

 namespace OpenMM {

@@ -259,19 +259,15 @@ public:
     * Create a HIP module from source code.
     *
     * @param source             the source code of the module
-     * @param optimizationFlags  the optimization flags to pass to the HIP compiler.  If this is
-     *                           omitted, a default set of options will be used
     */
-    hipModule_t createModule(const std::string source, const char* optimizationFlags = NULL);
+    hipModule_t createModule(const std::string source);
    /**
     * Create a HIP module from source code.
     *
     * @param source             the source code of the module
     * @param defines            a set of preprocessor definitions (name, value) to define when compiling the program
-     * @param optimizationFlags  the optimization flags to pass to the HIP compiler.  If this is
-     *                           omitted, a default set of options will be used
     */
-    hipModule_t createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
+    hipModule_t createModule(const std::string source, const std::map<std::string, std::string>& defines);
    /**
     * Get a kernel from a HIP module.
     *
@@ -358,22 +354,22 @@ public:
        return simdWidth;
    }
    /**
-     * Get whether the device being used warp shuffles.
+     * Get the number of multiprocessors (compute units) of the device being used.
     */
-    bool getSupportsWarpShuffle() const {
-        return hasWarpShuffle;
+    int getMultiprocessors() const {
+        return multiprocessors;
    }
    /**
     * Get whether the device being used supports 64 bit atomic operations on global memory.
     */
    bool getSupports64BitGlobalAtomics() const {
-        return hasGlobalInt64Atomics;
+        return true;
    }
    /**
     * Get whether the device being used supports double precision math.
     */
    bool getSupportsDoublePrecision() const {
-        return hasDoubles;
+        return true;
    }
    /**
     * Get whether double precision is being used.
@@ -555,16 +551,13 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    int simdWidth;
+    int multiprocessors;
    int sharedMemPerBlock;
-    bool hasDoubles;
-    bool hasGlobalInt64Atomics;
-    bool hasWarpShuffle;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isHipccAvailable, hasAssignedPosqCharges;
    bool isLinkedContext;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;
-    std::string defaultOptimizationOptions;
    std::map<std::string, std::string> compilationDefines;
    hipDevice_t device;
    hipStream_t currentStream;

--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -69,8 +69,7 @@ using namespace OpenMM;
 using namespace std;

 const int HipContext::ThreadBlockSize = 64;
-const int HipContext::TileSize = sizeof(tileflags)*8;
-static_assert(HipContext::ThreadBlockSize == HipContext::TileSize);
+const int HipContext::TileSize = 32;
 bool HipContext::hasInitializedHip = false;


@@ -136,7 +135,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
            int trialDeviceIndex = devicePrecedence[i];
            CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
-            defaultOptimizationOptions = "-ffast-math -Wall";
            // try setting device
            if (hipSetDevice(device) == hipSuccess) {
                // and set flags
@@ -172,10 +170,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
    // set device properties
    this->simdWidth = props.warpSize;
    this->sharedMemPerBlock = props.sharedMemPerBlock;
-    this->hasGlobalInt64Atomics = props.arch.hasGlobalInt64Atomics;
-    this->hasDoubles = props.arch.hasDoubles;
-    // HIP-TODO: remove hasWarpShuffle==false paths completely?
-    this->hasWarpShuffle = true;

    gpuArchitecture = props.gcnArchName;
    // HIP-TODO: find a good value here
@@ -195,19 +189,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
-    int multiprocessors;
    CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
+    // For RDNA GPUs hipDeviceAttributeMultiprocessorCount means WGP (work-group processors, two compute units), not CUs.
+    if (simdWidth == 32)
+        multiprocessors *= 2;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;

-    // GCN hardware is more like CUDA-8, w/ no independent forward progress
-    // or *_sync primatives
-    compilationDefines["SYNC_WARPS"] = "";
-    compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
-    // Important: the predicate for ballot is defined as an integer, hence
-    // it is important that we convert the variable field to a true/false value
-    // before running ballot, such that we do not discard the top half when
-    // running on a long int
-    compilationDefines["BALLOT(var)"] = "__ballot((var) != 0);";
+    compilationDefines["USE_HIP"] = "1";
+    if (simdWidth == 32)
+        compilationDefines["AMD_RDNA"] = "1";
    if (useDoublePrecision) {
        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
@@ -263,11 +253,11 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy

    // Set defines based on the requested precision.

-    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
-    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
-    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
-    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
-    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
+    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "__fsqrt_rn";
+    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "__frsqrt_rn";
+    compilationDefines["RECIP(x)"] = useDoublePrecision ? "(1.0/(x))" : "(1.0f/(x))";
+    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "__expf";
+    compilationDefines["LOG"] = useDoublePrecision ? "log" : "__logf";
    compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
@@ -423,14 +413,22 @@ void HipContext::setAsCurrent() {
        hipSetDevice(device);
 }

-hipModule_t HipContext::createModule(const string source, const char* optimizationFlags) {
-    return createModule(source, map<string, string>(), optimizationFlags);
+hipModule_t HipContext::createModule(const string source) {
+    return createModule(source, map<string, string>());
 }

-hipModule_t HipContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
-    static_assert(8*sizeof(void*) == HipContext::TileSize);
+hipModule_t HipContext::createModule(const string source, const map<string, string>& defines) {
+    const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
+    bool saveTemps = saveTempsEnv != nullptr;
    string bits = intToString(8*sizeof(void*));
-    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
+    string options = "-ffast-math -Wall";
+    if (gpuArchitecture.find("gfx90a") == 0 ||
+        gpuArchitecture.find("gfx94") == 0) {
+        // HIP-TODO: Remove it when the compiler does a better job
+        // Disable SLP vectorization as it may generate unoptimal packed math instructions on
+        // >=MI200 (gfx90a, gfx942): more v_mov, higher register usage etc.
+        options += " -fno-slp-vectorize";
+    }
    if (getMaxThreadBlockSize() < 1024) {
        options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
    }
@@ -478,9 +476,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
-
-    src << "typedef unsigned long tileflags;\n";
-    src << "static_assert(sizeof(tileflags)*8==" << HipContext::TileSize << ",\"tileflags size does not match TILE_SIZE\");\n";
+    src << "typedef unsigned int tileflags;\n";
    src << HipKernelSources::common << endl;
    for (auto& pair : defines) {
        src << "#define " << pair.first;
@@ -490,6 +486,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
    }
    if (!defines.empty())
        src << endl;
+    src << HipKernelSources::intrinsics << endl;
    src << source << endl;

    // See whether we already have PTX for this kernel cached.
@@ -499,12 +496,12 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
    sha1.Final();
    UINT_8 hash[20];
    sha1.GetHash(hash);
-    stringstream cacheFile;
-    cacheFile << cacheDir;
-    cacheFile.flags(ios::hex);
+    stringstream cacheHash;
+    cacheHash.flags(ios::hex);
    for (int i = 0; i < 20; i++)
-        cacheFile << setw(2) << setfill('0') << (int) hash[i];
-    cacheFile << '_' << gpuArchitecture << '_' << bits;
+        cacheHash << setw(2) << setfill('0') << (int) hash[i];
+    stringstream cacheFile;
+    cacheFile << cacheDir << cacheHash.str() << '_' << gpuArchitecture << '_' << bits;
    hipModule_t module;
    if (hipModuleLoad(&module, cacheFile.str().c_str()) == hipSuccess)
        return module;
@@ -512,11 +509,22 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
    // Select names for the various temporary files.

    stringstream tempFileName;
-    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
-    tempFileName << "_" << getpid();
-    string inputFile = (tempDir+tempFileName.str()+".hip.cpp");
-    string outputFile = (tempDir+tempFileName.str()+".hsaco");
-    string logFile = (tempDir+tempFileName.str()+".log");
+    if (saveTemps) {
+        tempFileName << saveTempsEnv;
+        const char* saveTempsPrefixEnv = getenv("OPENMM_SAVE_TEMPS_PREFIX");
+        if (saveTempsPrefixEnv) {
+            tempFileName << saveTempsPrefixEnv;
+        }
+        tempFileName << cacheHash.str();
+    }
+    else {
+        tempFileName << tempDir;
+        tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
+        tempFileName << "_" << getpid();
+    }
+    string inputFile = (tempFileName.str()+".hip.cpp");
+    string outputFile = (tempFileName.str()+".hsaco");
+    string logFile = (tempFileName.str()+".log");
    int res = 0;

    // If the runtime compiler plugin is available, use it.
@@ -550,7 +558,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
        ofstream out(inputFile.c_str());
        out << src.str();
        out.close();
-        string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + " -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
+        string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
        res = std::system(command.c_str());
    }
    try {
@@ -576,20 +584,16 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
            m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")";
            throw OpenMMException(m.str());
        }
-        const char* save = getenv("OPENMM_SAVE_TEMPS");
-        bool savetemps = save != nullptr;
-        if (!savetemps) {
+        if (!saveTemps) {
            remove(inputFile.c_str());
            remove(logFile.c_str());
        }
-        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !savetemps)
+        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !saveTemps)
            remove(outputFile.c_str());
        return module;
    }
    catch (...) {
-        const char* save = getenv("OPENMM_SAVE_TEMPS");
-        bool savetemps = save != nullptr;
-        if (!savetemps) {
+        if (!saveTemps) {
            remove(inputFile.c_str());
            remove(outputFile.c_str());
            remove(logFile.c_str());

--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -69,12 +69,10 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
-    int multiprocessors;
-    CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, context.getDevice()));
-    CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, 0));
-    CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(int), hipHostMallocPortable));
-    numForceThreadBlocks = 4*multiprocessors;
-    forceThreadBlockSize = (256);
+    CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, context.getEventFlags()));
+    CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), hipHostMallocPortable));
+    numForceThreadBlocks = 5*4*context.getMultiprocessors();
+    forceThreadBlockSize = 64;
    setKernelSource(HipKernelSources::nonbonded);
 }

@@ -234,7 +232,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
    // Record the exclusion data.

    exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
-    tileflags allFlags = static_cast<tileflags>(-1);
+    tileflags allFlags = (tileflags) -1;
    vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/HipContext::TileSize;
@@ -245,11 +243,11 @@ void HipNonbondedUtilities::initialize(const System& system) {
            int offset2 = atom2-y*HipContext::TileSize;
            if (x > y) {
                int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
-                exclusionVec[index+offset1] &= allFlags-(static_cast<tileflags>(1)<<offset2);
+                exclusionVec[index+offset1] &= allFlags-(1<<offset2);
            }
            else {
                int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
-                exclusionVec[index+offset2] &= allFlags-(static_cast<tileflags>(1)<<offset1);
+                exclusionVec[index+offset2] &= allFlags-(1<<offset1);
            }
        }
    }
@@ -413,7 +411,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
        hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
        if (kernel == NULL)
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
-        context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    if (useCutoff && numTiles > 0) {
        hipEventSynchronize(downloadCountEvent);
@@ -445,7 +443,7 @@ bool HipNonbondedUtilities::updateNeighborListSize() {
        findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
    }
    if (pinnedCountBuffer[1] > maxSinglePairs) {
-        maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
+        maxSinglePairs = (unsigned int) (1.2*pinnedCountBuffer[1]);
        singlePairs.resize(maxSinglePairs);
        if (forceArgs.size() > 0)
            forceArgs[19] = &singlePairs.getDevicePointer();
@@ -512,18 +510,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
-    stringstream localData;
-    int localDataSize = 0;
-    for (const ParameterInfo& param : params) {
-        if (param.getNumComponents() == 1)
-            localData<<param.getType()<<" "<<param.getName()<<";\n";
-        else {
-            for (int j = 0; j < param.getNumComponents(); ++j)
-                localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
-        }
-        localDataSize += param.getSize();
-    }
-    replacements["ATOM_PARAMETER_DATA"] = localData.str();
    stringstream args;
    for (const ParameterInfo& param : params) {
        args << ", ";
@@ -556,101 +542,43 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();

-    bool useShuffle = context.getSupportsWarpShuffle();
-
    // Part 1. Defines for on diagonal exclusion tiles
-    stringstream loadLocal1;
-    if(useShuffle) {
-        // not needed if using shuffles as we can directly fetch from register
-    } else {
-        for (const ParameterInfo& param : params) {
-            if (param.getNumComponents() == 1)
-                loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
-            else {
-                for (int j = 0; j < param.getNumComponents(); ++j)
-                    loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
-            }
-        }
-    }
-    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();

    stringstream broadcastWarpData;
-    if (useShuffle) {
-        broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
-        broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
-        broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
-        broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
-        for (const ParameterInfo& param : params) {
-            broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
-            for (int j = 0; j < param.getNumComponents(); j++) {
-                if (param.getNumComponents() == 1)
-                    broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n";
-                else
-                    broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
-            }
+    broadcastWarpData << "posq2.x = SHFL(shflPosq.x, j);\n";
+    broadcastWarpData << "posq2.y = SHFL(shflPosq.y, j);\n";
+    broadcastWarpData << "posq2.z = SHFL(shflPosq.z, j);\n";
+    broadcastWarpData << "posq2.w = SHFL(shflPosq.w, j);\n";
+    for (const ParameterInfo& param : params) {
+        broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
+        for (int j = 0; j < param.getNumComponents(); j++) {
+            if (param.getNumComponents() == 1)
+                broadcastWarpData << "shfl" << param.getName() << "=SHFL(" << param.getName() <<"1,j);\n";
+            else
+                broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=SHFL(" << param.getName()+"1."+suffixes[j] <<",j);\n";
        }
-    } else {
-        // not used if not shuffling
    }
    replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();

    // Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
    stringstream declareLocal2;
-    if (useShuffle) {
-        for (const ParameterInfo& param : params)
-            declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
-    }
-    else {
-        // not used if using shared memory
-    }
+    for (const ParameterInfo& param : params)
+        declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
    replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();

    stringstream loadLocal2;
-    if (useShuffle) {
-        for (const ParameterInfo& param : params)
-            loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
-    }
-    else {
-        for (const ParameterInfo& param : params) {
-            if (param.getNumComponents() == 1)
-                loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
-            else {
-                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
-                for (int j = 0; j < param.getNumComponents(); ++j)
-                    loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
-            }
-        }
-    }
+    for (const ParameterInfo& param : params)
+        loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();

    stringstream load2j;
-    if (useShuffle) {
-        for (const ParameterInfo& param : params)
-            load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
-    }
-    else {
-        for (const ParameterInfo& param : params) {
-            if (param.getNumComponents() == 1)
-                load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
-            else {
-                load2j<<param.getType()<<" "<<param.getName()<<"2 = make_"<<param.getType()<<"(";
-                for (int j = 0; j < param.getNumComponents(); ++j) {
-                    if (j > 0)
-                        load2j<<", ";
-                    load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
-                }
-                load2j<<");\n";
-            }
-        }
-    }
+    for (const ParameterInfo& param : params)
+        load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();

    stringstream clearLocal;
    for (const ParameterInfo& param : params) {
-        if (useShuffle)
-            clearLocal<<"shfl";
-        else
-            clearLocal<<"localData[atom2].";
+        clearLocal<<"shfl";
        clearLocal<<param.getName()<<" = ";
        if (param.getNumComponents() == 1)
            clearLocal<<"0;\n";
@@ -673,29 +601,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();

    stringstream shuffleWarpData;
-    if(useShuffle) {
-        shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
-        shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
-        shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
-        shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
-        shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
-        shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
-        shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
-        for (const ParameterInfo& param : params) {
-            if (param.getNumComponents() == 1)
-                shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n";
-            else {
-                for (int j = 0; j < param.getNumComponents(); j++) {
-                    // looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
-                    shuffleWarpData<<"shfl"<<param.getName()
-                        <<"."<<suffixes[j]<<"=real_shfl(shfl"
-                        <<param.getName()<<"."<<suffixes[j]
-                        <<", tgx+1);\n";
-                }
-            }
-        }
-    } else {
-        // not used otherwise
+    shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
+    shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
+    for (const ParameterInfo& param : params) {
+        shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n";
    }
    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();

@@ -708,8 +617,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
-    if (useShuffle)
-        defines["ENABLE_SHUFFLE"] = "1";
+    defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
    if (includeForces)
        defines["INCLUDE_FORCES"] = "1";
    if (includeEnergy)
@@ -725,6 +633,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
        }
    }
    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
+    defines["MAX_CUTOFF_SQUARED"] = context.doubleToString(maxCutoff*maxCutoff);
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
@@ -736,8 +645,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
    int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
    defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
-    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
-        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
    hipFunction_t kernel = context.getKernel(program, "computeNonbonded");
    return kernel;

--- a/platforms/hip/src/kernels/common.hip
+++ b/platforms/hip/src/kernels/common.hip
@@ -16,11 +16,36 @@
 #define GROUP_ID blockIdx.x
 #define NUM_GROUPS gridDim.x
 #define SYNC_THREADS __syncthreads();
+#define SYNC_WARPS {__builtin_amdgcn_wave_barrier(); __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront");}
 #define MEM_FENCE __threadfence_block();
 #define ATOMIC_ADD(dest, value) atomicAdd(dest, value)

 typedef long long mm_long;
 typedef unsigned long long mm_ulong;

-#define SUPPORTS_64_BIT_ATOMICS __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__
-#define SUPPORTS_DOUBLE_PRECISION __HIP_ARCH_HAS_DOUBLES__
+#define SUPPORTS_DOUBLE_PRECISION 1
+
+#ifdef USE_DOUBLE_PRECISION
+
+__device__ inline long long realToFixedPoint(double x) {
+    return static_cast<long long>(x * 0x100000000);
+}
+
+#else
+
+__device__ inline long long realToFixedPoint(float x) {
+    // Faster way to calculate static_cast<long long>(x * 0x100000000) with exactly the same
+    // results but less instructions.
+    float integral = truncf(x);
+    float fractional = (x - integral) * 0x100000000;
+    unsigned int integral_u32 = static_cast<int>(integral);
+    unsigned int fractional_u32 = static_cast<unsigned int>(fabsf(fractional));
+    // A negative real number (with non-zero fractional) needs rounding-down x for integral and
+    // changing fractional's sign. However, -1 is used as a threshold instead of 0 because, when
+    // fractional is in (-1; 0], fractional_u32 is 0 and the number is considered an integer.
+    bool isNegReal = fractional <= -1.0f;
+    return (static_cast<unsigned long long>(isNegReal ? integral_u32 - 1 : integral_u32) << 32) |
+            static_cast<unsigned long long>(isNegReal ? 0 - fractional_u32 : fractional_u32);
+}
+
+#endif
--- a/platforms/hip/src/kernels/intrinsics.hip
+++ b/platforms/hip/src/kernels/intrinsics.hip
+/**
+ * This file contains the device function for using cross-lane operations (ballot and shuffle)
+ */
+
+#if defined(TILE_SIZE)
+#if !defined(AMD_RDNA)
+// Two subwarps per warp
+#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
+#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
+#else
+#define SHFL(var, srcLane) __shfl(var, srcLane)
+#define BALLOT(var) __ballot(var)
+#endif
+#endif
+
+template<class T>
+static __inline__ __device__
+T warpShuffle(const T& input, const int src_lane) {
+    static_assert(sizeof(T) % sizeof(int) == 0, "incorrect type size");
+    constexpr int words_no = sizeof(T) / sizeof(int);
+
+    T output;
+    #pragma unroll
+    for(int i = 0; i < words_no; i++) {
+        int word;
+        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
+        word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
+        __builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, sizeof(int));
+    }
+
+    return output;
+}
+
+template<int Subwarp, class T>
+static __inline__ __device__
+typename std::enable_if<(Subwarp == warpSize), T>::type
+warpRotateLeft(const T& input) {
+    return warpShuffle(input, threadIdx.x + 1);
+}
+
+template<int Subwarp, class T>
+static __inline__ __device__
+typename std::enable_if<!(Subwarp == warpSize), T>::type
+warpRotateLeft(const T& input) {
+    return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
+}
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
-#ifndef ENABLE_SHUFFLE
-typedef struct {
-    real x, y, z;
-    real q;
-    real fx, fy, fz;
-    ATOM_PARAMETER_DATA
-#ifndef PARAMETER_SIZE_IS_EVEN
-    real padding;
-#endif
-} AtomData;
-#endif
-
-#ifdef ENABLE_SHUFFLE
-#define real_shfl SHFL
-#endif
-
 /**
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on
@@ -75,7 +59,7 @@ typedef struct {
 * [in]interactingAtoms - a list of interactions within a given tile
 *
 */
-extern "C" __global__ void computeNonbonded(
+extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
 #ifdef USE_CUTOFF
@@ -86,21 +70,15 @@ extern "C" __global__ void computeNonbonded(
 #endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
    mixed energy = 0;
    INIT_DERIVATIVES
-    // used shared memory if the device cannot shuffle
-#ifndef ENABLE_SHUFFLE
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-#endif

    // First loop: process tiles that contain exclusions.

-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+    for (int pos = FIRST_EXCLUSION_TILE+warp; pos < LAST_EXCLUSION_TILE; pos+=totalWarps) {
        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
@@ -111,62 +89,57 @@ extern "C" __global__ void computeNonbonded(
 #ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
-        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
-#ifdef ENABLE_SHUFFLE
            real4 shflPosq = posq1;
-#else
-            localData[threadIdx.x].x = posq1.x;
-            localData[threadIdx.x].y = posq1.y;
-            localData[threadIdx.x].z = posq1.z;
-            localData[threadIdx.x].q = posq1.w;
-            LOAD_LOCAL_PARAMETERS_FROM_1
-#endif

            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
+            #pragma unroll 2
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+j;
                real4 posq2;
-#ifdef ENABLE_SHUFFLE
                BROADCAST_WARP_DATA
-#else
-                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-#endif
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = r2*invR;
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+j;
+#ifdef USE_CUTOFF
+                if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    int atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                real dEdR = 0.0f;
+                    real dEdR = 0.0f;
 #else
-                real3 dEdR1 = make_real3(0);
-                real3 dEdR2 = make_real3(0);
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
 #endif
 #ifdef USE_EXCLUSIONS
-                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1));
+                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 0.5f;
+                    COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                    energy += 0.5f*tempEnergy;
 #endif
-                real tempEnergy = 0.0f;
-                const real interactionScale = 0.5f;
-                COMPUTE_INTERACTION
-                energy += 0.5f*tempEnergy;
 #ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
-                force.x -= delta.x*dEdR;
-                force.y -= delta.y*dEdR;
-                force.z -= delta.z*dEdR;
+                    force.x -= delta.x*dEdR;
+                    force.y -= delta.y*dEdR;
+                    force.z -= delta.z*dEdR;
 #else
-                force.x -= dEdR1.x;
-                force.y -= dEdR1.y;
-                force.z -= dEdR1.z;
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
 #endif
 #endif
+#ifdef USE_CUTOFF
+                }
+#endif
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
 #endif
@@ -175,117 +148,88 @@ extern "C" __global__ void computeNonbonded(
        else {
            // This is an off-diagonal tile.
            unsigned int j = y*TILE_SIZE + tgx;
+            int atomIndex = j;
            real4 shflPosq = posq[j];
-#ifdef ENABLE_SHUFFLE
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
-#else
-            localData[threadIdx.x].x = shflPosq.x;
-            localData[threadIdx.x].y = shflPosq.y;
-            localData[threadIdx.x].z = shflPosq.z;
-            localData[threadIdx.x].q = shflPosq.w;
-            localData[threadIdx.x].fx = 0.0f;
-            localData[threadIdx.x].fy = 0.0f;
-            localData[threadIdx.x].fz = 0.0f;
-#endif
            DECLARE_LOCAL_PARAMETERS
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
 #ifdef USE_EXCLUSIONS
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
-            unsigned int tj = tgx;
+            #pragma unroll 2
            for (j = 0; j < TILE_SIZE; j++) {
-                int atom2 = tbx+tj;
-#ifdef ENABLE_SHUFFLE
                real4 posq2 = shflPosq;
-#else
-                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-#endif
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                real invR = RSQRT(r2);
-                real r = r2*invR;
-                LOAD_ATOM2_PARAMETERS
-                atom2 = y*TILE_SIZE+tj;
+#ifdef USE_CUTOFF
+                if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    int atom2 = atomIndex;
 #ifdef USE_SYMMETRIC
-                real dEdR = 0.0f;
+                    real dEdR = 0.0f;
 #else
-                real3 dEdR1 = make_real3(0);
-                real3 dEdR2 = make_real3(0);
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
 #endif
 #ifdef USE_EXCLUSIONS
-                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1));
+                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
+                    COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                    energy += tempEnergy;
 #endif
-                real tempEnergy = 0.0f;
-                const real interactionScale = 1.0f;
-                COMPUTE_INTERACTION
-                energy += tempEnergy;
 #ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
-                delta *= dEdR;
-                force.x -= delta.x;
-                force.y -= delta.y;
-                force.z -= delta.z;
-#ifdef ENABLE_SHUFFLE
-                shflForce.x += delta.x;
-                shflForce.y += delta.y;
-                shflForce.z += delta.z;
-
-#else
-                localData[tbx+tj].fx += delta.x;
-                localData[tbx+tj].fy += delta.y;
-                localData[tbx+tj].fz += delta.z;
-#endif
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
+                    shflForce.z += delta.z;
 #else // !USE_SYMMETRIC
-                force.x -= dEdR1.x;
-                force.y -= dEdR1.y;
-                force.z -= dEdR1.z;
-#ifdef ENABLE_SHUFFLE
-                shflForce.x += dEdR2.x;
-                shflForce.y += dEdR2.y;
-                shflForce.z += dEdR2.z;
-#else
-                localData[tbx+tj].fx += dEdR2.x;
-                localData[tbx+tj].fy += dEdR2.y;
-                localData[tbx+tj].fz += dEdR2.z;
-#endif
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
+                    shflForce.z += dEdR2.z;
 #endif // end USE_SYMMETRIC
 #endif
-#ifdef ENABLE_SHUFFLE
-                SHUFFLE_WARP_DATA
+#ifdef USE_CUTOFF
+                }
 #endif
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
 #endif
-                // cycles the indices
-                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
-                tj = (tj + 1) & (TILE_SIZE - 1);
+                SHUFFLE_WARP_DATA
+                atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
            }
-            const unsigned int offset = y*TILE_SIZE + tgx;
+            const unsigned int offset = atomIndex;
            // write results for off diagonal tiles
 #ifdef INCLUDE_FORCES
-#ifdef ENABLE_SHUFFLE
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
-#else
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-#endif
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
 #endif
        }
        // Write results for on and off diagonal tiles
 #ifdef INCLUDE_FORCES
        const unsigned int offset = x*TILE_SIZE + tgx;
-        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
 #endif
    }

@@ -296,22 +240,21 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(long long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
+    for (unsigned int pos0 = warp; pos0 < LAST_EXCLUSION_TILE+numTiles; pos0+=totalWarps) {
+        // Skip warps that may be still busy in the first loop
+        if (pos0 < LAST_EXCLUSION_TILE) {
+            continue;
+        }
+        const unsigned int pos = pos0-LAST_EXCLUSION_TILE;
 #else
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
-#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
-    // atomIndices can probably be shuffled as well
-    // but it probably wouldn't make things any faster
-    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
-    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
-    skipTiles[threadIdx.x] = -1;
-
-    while (pos < end) {
-        const bool hasExclusions = false;
+    int skipTiles;
+    skipTiles = -1;
+    for (; pos < end; pos++) {
+#endif
        real3 force = make_real3(0);
        bool includeTile = true;

@@ -334,63 +277,44 @@ extern "C" __global__ void computeNonbonded(

        // Skip over tiles that have exclusions, since they were already processed.

-        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+        while (SHFL(skipTiles, TILE_SIZE-1) < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                int2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
-                skipTiles[threadIdx.x] = end;
+                skipTiles = end;
            skipBase += TILE_SIZE;
-            currentSkipIndex = tbx;
+            currentSkipIndex = 0;
        }
-        while (skipTiles[currentSkipIndex] < pos)
+        while (SHFL(skipTiles, currentSkipIndex) < pos)
            currentSkipIndex++;
-        includeTile = (skipTiles[currentSkipIndex] != pos);
+        includeTile = (SHFL(skipTiles, currentSkipIndex) != pos);
 #endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
-            //const unsigned int localAtomIndex = threadIdx.x;
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-            atomIndices[threadIdx.x] = j;
-#ifdef ENABLE_SHUFFLE
+            int atomIndex = j;
            DECLARE_LOCAL_PARAMETERS
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
-#endif
            if (j < PADDED_NUM_ATOMS) {
                // Load position of atom j from from global memory
-#ifdef ENABLE_SHUFFLE
                shflPosq = posq[j];
-#else
-                localData[threadIdx.x].x = posq[j].x;
-                localData[threadIdx.x].y = posq[j].y;
-                localData[threadIdx.x].z = posq[j].z;
-                localData[threadIdx.x].q = posq[j].w;
-                localData[threadIdx.x].fx = 0.0f;
-                localData[threadIdx.x].fy = 0.0f;
-                localData[threadIdx.x].fz = 0.0f;
-#endif
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            else {
-#ifdef ENABLE_SHUFFLE
                shflPosq = make_real4(0, 0, 0, 0);
-#else
-                localData[threadIdx.x].x = 0;
-                localData[threadIdx.x].y = 0;
-                localData[threadIdx.x].z = 0;
-#endif
                CLEAR_LOCAL_PARAMETERS
            }
 #ifdef USE_PERIODIC
@@ -399,171 +323,132 @@ extern "C" __global__ void computeNonbonded(
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-#ifdef ENABLE_SHUFFLE
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
-#else
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
-#endif
-                unsigned int tj = tgx;
+                #pragma unroll 2
                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
-#else
-                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-#endif
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = atomIndices[tbx+tj];
+#ifdef USE_CUTOFF
+                    if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        int atom2 = atomIndex;
 #ifdef USE_SYMMETRIC
-                    real dEdR = 0.0f;
+                        real dEdR = 0.0f;
 #else
-                    real3 dEdR1 = make_real3(0);
-                    real3 dEdR2 = make_real3(0);
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
 #endif
 #ifdef USE_EXCLUSIONS
-                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
+                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
+#endif
+                        real tempEnergy = 0.0f;
+                        const real interactionScale = 1.0f;
+                        COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                        energy += tempEnergy;
 #endif
-                    real tempEnergy = 0.0f;
-                    const real interactionScale = 1.0f;
-                    COMPUTE_INTERACTION
-                    energy += tempEnergy;
 #ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
-                    delta *= dEdR;
-                    force.x -= delta.x;
-                    force.y -= delta.y;
-                    force.z -= delta.z;
-#ifdef ENABLE_SHUFFLE
-                    shflForce.x += delta.x;
-                    shflForce.y += delta.y;
-                    shflForce.z += delta.z;
-
-#else
-                    localData[tbx+tj].fx += delta.x;
-                    localData[tbx+tj].fy += delta.y;
-                    localData[tbx+tj].fz += delta.z;
-#endif
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
 #else // !USE_SYMMETRIC
-                    force.x -= dEdR1.x;
-                    force.y -= dEdR1.y;
-                    force.z -= dEdR1.z;
-#ifdef ENABLE_SHUFFLE
-                    shflForce.x += dEdR2.x;
-                    shflForce.y += dEdR2.y;
-                    shflForce.z += dEdR2.z;
-#else
-                    localData[tbx+tj].fx += dEdR2.x;
-                    localData[tbx+tj].fy += dEdR2.y;
-                    localData[tbx+tj].fz += dEdR2.z;
-#endif
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
 #endif // end USE_SYMMETRIC
 #endif
-#ifdef ENABLE_SHUFFLE
-                    SHUFFLE_WARP_DATA
+#ifdef USE_CUTOFF
+                    }
 #endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SHUFFLE_WARP_DATA
+                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
                }
            }
            else
 #endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.
-                unsigned int tj = tgx;
+                #pragma unroll 2
                for (j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+tj;
-#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
-#else
-                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-#endif
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    real invR = RSQRT(r2);
-                    real r = r2*invR;
-                    LOAD_ATOM2_PARAMETERS
-                    atom2 = atomIndices[tbx+tj];
+#ifdef USE_CUTOFF
+                    if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        int atom2 = atomIndex;
 #ifdef USE_SYMMETRIC
-                    real dEdR = 0.0f;
+                        real dEdR = 0.0f;
 #else
-                    real3 dEdR1 = make_real3(0);
-                    real3 dEdR2 = make_real3(0);
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
 #endif
 #ifdef USE_EXCLUSIONS
-                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
+                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
+#endif
+                        real tempEnergy = 0.0f;
+                        const real interactionScale = 1.0f;
+                        COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                        energy += tempEnergy;
 #endif
-                    real tempEnergy = 0.0f;
-                    const real interactionScale = 1.0f;
-                    COMPUTE_INTERACTION
-                    energy += tempEnergy;
 #ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
-                    delta *= dEdR;
-                    force.x -= delta.x;
-                    force.y -= delta.y;
-                    force.z -= delta.z;
-#ifdef ENABLE_SHUFFLE
-                    shflForce.x += delta.x;
-                    shflForce.y += delta.y;
-                    shflForce.z += delta.z;
-
-#else
-                    localData[tbx+tj].fx += delta.x;
-                    localData[tbx+tj].fy += delta.y;
-                    localData[tbx+tj].fz += delta.z;
-#endif
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
 #else // !USE_SYMMETRIC
-                    force.x -= dEdR1.x;
-                    force.y -= dEdR1.y;
-                    force.z -= dEdR1.z;
-#ifdef ENABLE_SHUFFLE
-                    shflForce.x += dEdR2.x;
-                    shflForce.y += dEdR2.y;
-                    shflForce.z += dEdR2.z;
-#else
-                    localData[tbx+tj].fx += dEdR2.x;
-                    localData[tbx+tj].fy += dEdR2.y;
-                    localData[tbx+tj].fz += dEdR2.z;
-#endif
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
 #endif // end USE_SYMMETRIC
 #endif
-#ifdef ENABLE_SHUFFLE
-                    SHUFFLE_WARP_DATA
+#ifdef USE_CUTOFF
+                    }
 #endif
-                    tj = (tj + 1) & (TILE_SIZE - 1);
+                    SHUFFLE_WARP_DATA
+                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
                }
            }

            // Write results.
 #ifdef INCLUDE_FORCES
-            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-#ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[threadIdx.x];
-#else
-            unsigned int atom2 = y*TILE_SIZE + tgx;
-#endif
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
+            unsigned int atom2 = atomIndex;
            if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef ENABLE_SHUFFLE
-                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
-#else
-                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-#endif
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
            }
 #endif
        }
-        pos++;
    }

    // Third loop: single pairs that aren't part of a tile.
@@ -578,44 +463,45 @@ extern "C" __global__ void computeNonbonded(
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
-        LOAD_ATOM1_PARAMETERS
-        int j = atom2;
-atom2 = threadIdx.x;
-        DECLARE_LOCAL_PARAMETERS
-        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-        LOAD_ATOM2_PARAMETERS
-atom2 = pair.y;
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-        real invR = RSQRT(r2);
-        real r = r2*invR;
+        if (r2 < MAX_CUTOFF_SQUARED) {
+            LOAD_ATOM1_PARAMETERS
+            int j = atom2;
+            DECLARE_LOCAL_PARAMETERS
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            LOAD_ATOM2_PARAMETERS
+            real invR = RSQRT(r2);
+            real r = r2*invR;
 #ifdef USE_SYMMETRIC
-        real dEdR = 0.0f;
+            real dEdR = 0.0f;
 #else
-        real3 dEdR1 = make_real3(0);
-        real3 dEdR2 = make_real3(0);
-#endif
-        bool hasExclusions = false;
-        bool isExcluded = false;
-        real tempEnergy = 0.0f;
-        const real interactionScale = 1.0f;
-        COMPUTE_INTERACTION
-        energy += tempEnergy;
+            real3 dEdR1 = make_real3(0);
+            real3 dEdR2 = make_real3(0);
+#endif
+            bool isExcluded = false;
+            real tempEnergy = 0.0f;
+            const real interactionScale = 1.0f;
+            COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+            energy += tempEnergy;
+#endif
 #ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
-        real3 dEdR1 = delta*dEdR;
-        real3 dEdR2 = -dEdR1;
+            real3 dEdR1 = delta*dEdR;
+            real3 dEdR2 = -dEdR1;
 #endif
-        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
-        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
-        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
-        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
-        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
-        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.x)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.y)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.z)));
+            atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.x)));
+            atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.y)));
+            atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.z)));
 #endif
+        }
    }
 #endif
 #ifdef INCLUDE_ENERGY