Improved support for devices without 64 bit atomics (#3737)

ae686364 · Peter Eastman · GitHub · 48664a1f · ae686364 · ae686364
Unverified commit ae686364, authored Aug 17, 2022 by Peter Eastman; committed by GitHub Aug 17, 2022
20 changed files
--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -786,8 +786,6 @@ private:
    ComputeArray globals;
    ComputeArray donors;
    ComputeArray acceptors;
-    ComputeArray donorBufferIndices;
-    ComputeArray acceptorBufferIndices;
    ComputeArray donorExclusions;
    ComputeArray acceptorExclusions;
    std::vector<std::string> globalParamNames;

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
--- a/platforms/common/src/IntegrationUtilities.cpp
+++ b/platforms/common/src/IntegrationUtilities.cpp
@@ -528,8 +528,6 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
    for (int i = 0; i < numAtoms; i++)
        if (atomCounts[i] > 1)
            hasOverlappingVsites = true;
-    if (hasOverlappingVsites && !context.getSupports64BitGlobalAtomics())
-        throw OpenMMException("This device does not support 64 bit atomics.  Cannot have multiple virtual sites that depend on the same atom.");
    // Create the kernels used by this class.

--- a/platforms/common/src/kernels/customGBEnergyN2.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2.cc
-#ifdef SUPPORTS_64_BIT_ATOMICS
 #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
 #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[LOCAL_ID]));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
-#endif
 /**
 * Compute a force based on pair interactions.
 */
 KERNEL void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles, int needEnergy,
@@ -160,7 +151,6 @@ KERNEL void computeN2Energy(
        // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -173,18 +163,6 @@ KERNEL void computeN2Energy(
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[LOCAL_ID].z));
            STORE_DERIVATIVES_2
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset = offset1;
-        forceBuffers[offset1].xyz += force.xyz;
-        STORE_DERIVATIVES_1
-        if (x != y) {
-            offset = offset2;
-            forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
-            STORE_DERIVATIVES_2
-        }
-#endif
    }
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -363,7 +341,6 @@ KERNEL void computeN2Energy(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
@@ -376,18 +353,6 @@ KERNEL void computeN2Energy(
                offset = atom2;
                STORE_DERIVATIVES_2
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            unsigned int offset = offset1;
-            STORE_DERIVATIVES_1
-            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
-                offset = offset2;
-                STORE_DERIVATIVES_2
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
-#ifdef SUPPORTS_64_BIT_ATOMICS
 #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
 #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[tgx]));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
-#endif
 /**
 * Compute a force based on pair interactions.
 */
 KERNEL void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles, int needEnergy,
@@ -100,17 +91,11 @@ KERNEL void computeN2Energy(
                // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                STORE_DERIVATIVES_1
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += force.xyz;
-                STORE_DERIVATIVES_1
-#endif
            }
        }
        else {
@@ -174,33 +159,21 @@ KERNEL void computeN2Energy(
                // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                STORE_DERIVATIVES_1
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += force.xyz;
-                STORE_DERIVATIVES_1
-#endif
            }
            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE+tgx;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(local_force[tgx].x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
                STORE_DERIVATIVES_2
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += local_force[tgx].xyz;
-                STORE_DERIVATIVES_2
-#endif
            }
        }
    }
@@ -316,17 +289,11 @@ KERNEL void computeN2Energy(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                    STORE_DERIVATIVES_1
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += force.xyz;
-                    STORE_DERIVATIVES_1
-#endif
                }
            }
            else
@@ -375,17 +342,11 @@ KERNEL void computeN2Energy(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                    STORE_DERIVATIVES_1
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += force.xyz;
-                    STORE_DERIVATIVES_1
-#endif
                }
            }
@@ -398,17 +359,11 @@ KERNEL void computeN2Energy(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(local_force[tgx].x));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
                    unsigned int offset = atom2;
                    STORE_DERIVATIVES_2
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += local_force[tgx].xyz;
-                    STORE_DERIVATIVES_2
-#endif
                }
            }
        }

--- a/platforms/common/src/kernels/customGBEnergyPerParticle.cc
+++ b/platforms/common/src/kernels/customGBEnergyPerParticle.cc
@@ -10,20 +10,13 @@
 */
 KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT forceBuffers
-#else
-        GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the derivatives
-#ifndef SUPPORTS_64_BIT_ATOMICS
-        int totalSize = bufferSize*numBuffers;
-#endif
        REDUCE_DERIVATIVES
        // Now calculate the per-particle energy terms.

--- a/platforms/common/src/kernels/customGBGradientChainRule.cc
+++ b/platforms/common/src/kernels/customGBGradientChainRule.cc
@@ -3,29 +3,17 @@
 */
 KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
    GLOBAL mm_long* RESTRICT forceBuffers
-#else
-    GLOBAL real4* RESTRICT forceBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    INIT_PARAM_DERIVS
    const real scale = RECIP((real) 0x100000000);
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        real4 pos = posq[index];
-#ifdef SUPPORTS_64_BIT_ATOMICS
        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
-#else
-        real3 force = trimTo3(forceBuffers[index]);
-#endif
        COMPUTE_FORCES
-#ifdef SUPPORTS_64_BIT_ATOMICS
        forceBuffers[index] = realToFixedPoint(force.x);
        forceBuffers[index+PADDED_NUM_ATOMS] = realToFixedPoint(force.y);
        forceBuffers[index+PADDED_NUM_ATOMS*2] = realToFixedPoint(force.z);
-#else
-        forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
-#endif
    }
    SAVE_PARAM_DERIVS
 }
--- a/platforms/common/src/kernels/customGBValueN2.cc
+++ b/platforms/common/src/kernels/customGBValueN2.cc
@@ -3,11 +3,7 @@
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
-#else
-        GLOBAL real* RESTRICT global_value,
-#endif
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -137,7 +133,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
        // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset1 = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
        STORE_PARAM_DERIVS1
@@ -146,16 +141,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
            ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
            STORE_PARAM_DERIVS2
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        global_value[offset1] += value;
-        STORE_PARAM_DERIVS1
-        if (x != y) {
-            global_value[offset2] += local_value[LOCAL_ID];
-            STORE_PARAM_DERIVS2
-        }
-#endif
    }
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -317,7 +302,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            unsigned int offset1 = atom1;
            ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
            STORE_PARAM_DERIVS1
@@ -326,16 +310,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
                STORE_PARAM_DERIVS2
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            global_value[offset1] += value;
-            STORE_PARAM_DERIVS1
-            if (atom2 < PADDED_NUM_ATOMS) {
-                unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-                global_value[offset2] += local_value[LOCAL_ID];
-                STORE_PARAM_DERIVS2
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/customGBValueN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBValueN2_cpu.cc
@@ -3,11 +3,7 @@
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
-#else
-        GLOBAL real* RESTRICT global_value,
-#endif
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -84,13 +80,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
                ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset1] += value;
-#endif
                STORE_PARAM_DERIVS1
            }
        }
@@ -146,26 +137,16 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
                ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset1] += value;
-#endif
                STORE_PARAM_DERIVS1
            }
            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset2 = y*TILE_SIZE+tgx;
                ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
-#else
-                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset2] += local_value[tgx];
-#endif
                STORE_PARAM_DERIVS2
            }
        }
@@ -273,13 +254,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset1] += value;
-#endif
                    STORE_PARAM_DERIVS1
                }
            }
@@ -322,13 +298,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset1] += value;
-#endif
                    STORE_PARAM_DERIVS1
                }
            }
@@ -342,13 +313,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset2 = atom2;
                    ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
-#else
-                    unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset2] += local_value[tgx];
-#endif
                    STORE_PARAM_DERIVS2
                }
            }

--- a/platforms/common/src/kernels/customGBValuePerParticle.cc
+++ b/platforms/common/src/kernels/customGBValuePerParticle.cc
@@ -3,23 +3,12 @@
 */
 KERNEL void computePerParticleValues(GLOBAL real4* posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* valueBuffers
-#else
-        GLOBAL real* valueBuffers, int bufferSize, int numBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the pairwise value
-#ifdef SUPPORTS_64_BIT_ATOMICS
        real sum = valueBuffers[index]/(real) 0x100000000;
-#else
-        int totalSize = bufferSize*numBuffers;
-        real sum = valueBuffers[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += valueBuffers[i];
-#endif
        REDUCE_PARAM0_DERIV
        // Now calculate other values

--- a/platforms/common/src/kernels/customHbondForce.cc
+++ b/platforms/common/src/kernels/customHbondForce.cc
@@ -44,11 +44,7 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
 * Compute forces on donors.
 */
 KERNEL void computeDonorForces(
-#ifdef SUPPORTS_64_BIT_ATOMICS
 	GLOBAL mm_ulong* RESTRICT force,
-#else
-	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
-#endif
 	GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -114,7 +110,6 @@ KERNEL void computeDonorForces(
        // Write results
        if (donorIndex < NUM_DONORS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
            if (atoms.x > -1) {
                ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
@@ -133,27 +128,6 @@ KERNEL void computeDonorForces(
                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
                MEM_FENCE;
            }
-#else
-            int4 bufferIndices = donorBufferIndices[donorIndex];
-            if (atoms.x > -1) {
-                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f1.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.y > -1) {
-                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f2.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.z > -1) {
-                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f3.xyz;
-                forceBuffers[offset] = force;
-            }
-#endif
        }
    }
    energyBuffer[GLOBAL_ID] += energy;
@@ -162,11 +136,7 @@ KERNEL void computeDonorForces(
 * Compute forces on acceptors.
 */
 KERNEL void computeAcceptorForces(
-#ifdef SUPPORTS_64_BIT_ATOMICS
 	GLOBAL mm_ulong* RESTRICT force,
-#else
-	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -231,7 +201,6 @@ KERNEL void computeAcceptorForces(
        // Write results
        if (acceptorIndex < NUM_ACCEPTORS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
            if (atoms.x > -1) {
                ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
@@ -250,27 +219,6 @@ KERNEL void computeAcceptorForces(
                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
                MEM_FENCE;
            }
-#else
-            int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
-            if (atoms.x > -1) {
-                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f1.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.y > -1) {
-                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f2.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.z > -1) {
-                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f3.xyz;
-                forceBuffers[offset] = force;
-            }
-#endif
        }
    }
 }
--- a/platforms/common/src/kernels/customNonbondedGroups.cc
+++ b/platforms/common/src/kernels/customNonbondedGroups.cc
@@ -35,38 +35,8 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
 #endif
 }
-#ifndef SUPPORTS_64_BIT_ATOMICS
-/**
- * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
- * a single tile might have computed forces on the same atom.  This loops over them and makes sure
- * that only one thread updates the force on any given atom.
- */
-void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
-    localData[LOCAL_ID].x = atomIndex;
-    SYNC_WARPS;
-    real4 forceSum = make_real4(0);
-    int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
-    int end = start+32;
-    bool isFirst = true;
-    for (int i = start; i < end; i++)
-        if (localData[i].x == atomIndex) {
-            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
-            isFirst &= (i >= LOCAL_ID);
-        }
-    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
-    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
-    if (isFirst)
-        forceBuffers[offset] += forceSum;
-    SYNC_WARPS;
-}
-#endif
 KERNEL void computeInteractionGroups(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
        GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -139,7 +109,6 @@ KERNEL void computeInteractionGroups(
            }
            SYNC_WARPS;
        }
-#ifdef SUPPORTS_64_BIT_ATOMICS
        if (exclusions != 0) {
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -149,13 +118,6 @@ KERNEL void computeInteractionGroups(
        ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
        ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
        SYNC_WARPS;
-#else
-        writeForces(forceBuffers, localData, atom2);
-        localData[LOCAL_ID].fx = force.x;
-        localData[LOCAL_ID].fy = force.y;
-        localData[LOCAL_ID].fz = force.z;
-        writeForces(forceBuffers, localData, atom1);
-#endif
    }
    energyBuffer[GLOBAL_ID] += energy;
    SAVE_DERIVATIVES

--- a/platforms/common/src/kernels/gbsaObc.cc
+++ b/platforms/common/src/kernels/gbsaObc.cc
@@ -17,11 +17,7 @@ typedef struct ALIGN {
 * Compute the Born sum.
 */
 KERNEL void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_bornSum,
-#else
-        GLOBAL real* RESTRICT global_bornSum,
-#endif
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -152,20 +148,12 @@ KERNEL void computeBornSum(
        // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        global_bornSum[offset1] += bornSum;
-        if (x != y)
-            global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
-#endif
    }
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -357,17 +345,9 @@ KERNEL void computeBornSum(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum));
            if (atom2 < PADDED_NUM_ATOMS)
                ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            global_bornSum[offset1] += bornSum;
-            if (atom2 < PADDED_NUM_ATOMS)
-                global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
-#endif
        }
        pos++;
    }
@@ -385,11 +365,7 @@ typedef struct ALIGN {
 */
 KERNEL void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
-#else
-        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
@@ -538,7 +514,6 @@ KERNEL void computeGBSAForce1(
        // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -551,16 +526,6 @@ KERNEL void computeGBSAForce1(
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
            ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
-        global_bornForce[offset1] += force.w;
-        if (x != y) {
-            forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
-            global_bornForce[offset2] += localData[LOCAL_ID].fw;
-        }
-#endif
    }
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -763,7 +728,6 @@ KERNEL void computeGBSAForce1(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
@@ -774,16 +738,6 @@ KERNEL void computeGBSAForce1(
                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
                ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
-            global_bornForce[offset1] += force.w;
-            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
-                global_bornForce[offset2] += localData[LOCAL_ID].fw;
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/gbsaObc2.cc
+++ b/platforms/common/src/kernels/gbsaObc2.cc
@@ -16,13 +16,8 @@
    real t2I = (l_ij2I-u_ij2I);
    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
-#ifdef SUPPORTS_64_BIT_ATOMICS
    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
-#else
-    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
-    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
-#endif
 #ifdef USE_CUTOFF
    unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
 #else

--- a/platforms/common/src/kernels/gbsaObcReductions.cc
+++ b/platforms/common/src/kernels/gbsaObcReductions.cc
@@ -6,23 +6,12 @@
 */
 KERNEL void reduceBornSum(float alpha, float beta, float gamma,
-#ifdef SUPPORTS_64_BIT_ATOMICS
            GLOBAL const mm_long* RESTRICT bornSum,
-#else
-            GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
-#endif
            GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Get summed Born data
-#ifdef SUPPORTS_64_BIT_ATOMICS
        real sum = RECIP((real) 0x100000000)*bornSum[index];
-#else
-        real sum = bornSum[index];
-        int totalSize = bufferSize*numBuffers;
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += bornSum[i];
-#endif
        // Now calculate Born radius and OBC term.
@@ -45,24 +34,14 @@ KERNEL void reduceBornSum(float alpha, float beta, float gamma,
 */
 KERNEL void reduceBornForce(
-#ifdef SUPPORTS_64_BIT_ATOMICS
            GLOBAL mm_long* RESTRICT bornForce,
-#else
-            GLOBAL real* bornForce, int bufferSize, int numBuffers,
-#endif
            GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
    mixed energy = 0;
    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Get summed Born force
-#ifdef SUPPORTS_64_BIT_ATOMICS
        real force = RECIP((real) 0x100000000)*bornForce[index];
-#else
-        real force = bornForce[index];
-        int totalSize = bufferSize*numBuffers;
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            force += bornForce[i];
-#endif
        // Now calculate the actual force
        float offsetRadius = params[index].x;
@@ -73,11 +52,7 @@ KERNEL void reduceBornForce(
        force += saTerm/bornRadius;
        energy += saTerm;
        force *= bornRadius*bornRadius*obcChain[index];
-#ifdef SUPPORTS_64_BIT_ATOMICS
        bornForce[index] = realToFixedPoint(force);
-#else
-        bornForce[index] = force;
-#endif
    }
    energyBuffer[GLOBAL_ID] += energy/-6;
 }
--- a/platforms/common/src/kernels/gbsaObc_cpu.cc
+++ b/platforms/common/src/kernels/gbsaObc_cpu.cc
@@ -9,11 +9,7 @@ typedef struct {
 * Compute the Born sum.
 */
 KERNEL void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT global_bornSum,
-#else
-        GLOBAL real* RESTRICT global_bornSum,
-#endif
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -87,12 +83,7 @@ KERNEL void computeBornSum(
                // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += bornSum;
-#endif
            }
        }
        else {
@@ -149,24 +140,14 @@ KERNEL void computeBornSum(
               // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += bornSum;
-#endif
            }
            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
                ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum));
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += localData[tgx].bornSum;
-#endif
            }
        }
    }
@@ -296,12 +277,7 @@ KERNEL void computeBornSum(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += bornSum;
-#endif
                }
            }
            else
@@ -359,12 +335,7 @@ KERNEL void computeBornSum(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += bornSum;
-#endif
                }
            }
@@ -377,12 +348,7 @@ KERNEL void computeBornSum(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum));
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += localData[tgx].bornSum;
-#endif
                }
            }
        }
@@ -402,11 +368,7 @@ typedef struct {
 */
 KERNEL void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
-#else
-        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
@@ -490,16 +452,10 @@ KERNEL void computeGBSAForce1(
                // Write results.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                global_bornForce[offset] += force.w;
-#endif
            }
        }
        else {
@@ -561,36 +517,20 @@ KERNEL void computeGBSAForce1(
               // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                global_bornForce[offset] += force.w;
-#endif
            }
            // Write results.
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
                ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw));
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                real4 f = forceBuffers[offset];
-                f.x += localData[tgx].fx;
-                f.y += localData[tgx].fy;
-                f.z += localData[tgx].fz;
-                forceBuffers[offset] = f;
-                global_bornForce[offset] += localData[tgx].fw;
-#endif
            }
        }
    }
@@ -722,16 +662,10 @@ KERNEL void computeGBSAForce1(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                    global_bornForce[offset] += force.w;
-#endif
                }
            }
            else
@@ -790,16 +724,10 @@ KERNEL void computeGBSAForce1(
                    // Write results for atom1.
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                    global_bornForce[offset] += force.w;
-#endif
                }
            }
@@ -812,20 +740,10 @@ KERNEL void computeGBSAForce1(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                    ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw));
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    real4 f = forceBuffers[offset];
-                    f.x += localData[tgx].fx;
-                    f.y += localData[tgx].fy;
-                    f.z += localData[tgx].fz;
-                    forceBuffers[offset] = f;
-                    global_bornForce[offset] += localData[tgx].fw;
-#endif
                }
            }
        }

--- a/platforms/common/src/kernels/pme.cc
+++ b/platforms/common/src/kernels/pme.cc
 KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
-#ifndef SUPPORTS_64_BIT_ATOMICS
-        , GLOBAL real4* RESTRICT pmeBsplineTheta, LOCAL real4* RESTRICT bsplinesCache,
-#ifdef CHARGE_FROM_SIGEPS
-        GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        GLOBAL const real* RESTRICT charges
-#endif
-#endif
    ) {
    // Compute the index of the grid point each atom is associated with.
@@ -25,42 +17,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
-#ifndef SUPPORTS_64_BIT_ATOMICS
-        // Compute B-splines here for use in the charge spreading kernel.
-        const real4 scale = 1/(real) (PME_ORDER-1);
-        LOCAL real4* data = &bsplinesCache[LOCAL_ID*PME_ORDER];
-        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-        data[PME_ORDER-1] = 0.0f;
-        data[1] = dr;
-        data[0] = 1.0f-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            real div = RECIP(j-1.0f);
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real4(k))*data[j-k-2] + (-dr+make_real4(j-k))*data[j-k-1]);
-            data[0] = div*(- dr+1.0f)*data[0];
-        }
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+make_real4(j))*data[PME_ORDER-j-2] + (-dr+make_real4(PME_ORDER-j))*data[PME_ORDER-j-1]);
-        data[0] = scale*(-dr+1.0f)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) {
-#ifdef CHARGE_FROM_SIGEPS
-            const float2 sigEps = sigmaEpsilon[atom];
-            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-            const real charge = CHARGE;
-#endif
-            data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
-            pmeBsplineTheta[atom+j*NUM_ATOMS] = data[j];
-        }
-#endif
    }
 }
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #if defined(USE_HIP) && !defined(AMD_RDNA)
 LAUNCH_BOUNDS_EXACT(128, 1)
 #endif
@@ -206,197 +165,6 @@ KERNEL void finishSpreadCharge(
 #endif
    }
 }
-#elif defined(DEVICE_IS_CPU)
-KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ,
-#ifdef CHARGE_FROM_SIGEPS
-        GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        GLOBAL const real* RESTRICT charges
-#endif
-    ) {
-    const int firstx = GLOBAL_ID*GRID_SIZE_X/GLOBAL_SIZE;
-    const int lastx = (GLOBAL_ID+1)*GRID_SIZE_X/GLOBAL_SIZE;
-    if (firstx == lastx)
-        return;
-    const real4 scale = 1/(real) (PME_ORDER-1);
-    real4 data[PME_ORDER];
-    // Process the atoms in spatially sorted order.  This improves efficiency when writing
-    // the grid values.
-    for (int i = 0; i < NUM_ATOMS; i++) {
-        int atom = i;
-        real4 pos = posq[atom];
-        APPLY_PERIODIC_TO_POS(pos)
-        real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
-                           pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
-                           pos.z*recipBoxVecZ.z);
-        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
-        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
-        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
-        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z, 0);
-        // Spread the charge from this atom onto each grid point.
-#ifdef CHARGE_FROM_SIGEPS
-        const float2 sigEps = sigmaEpsilon[atom];
-        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-        const real charge = (CHARGE)*EPSILON_FACTOR;
-#endif
-        if (charge == 0)
-            continue;
-        bool hasComputedThetas = false;
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-            if (xindex < firstx || xindex >= lastx)
-                continue;
-            if (!hasComputedThetas) {
-                hasComputedThetas = true;
-                // Since we need the full set of thetas, it's faster to compute them here than load them
-                // from global memory.
-                real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-                data[PME_ORDER-1] = 0.0f;
-                data[1] = dr;
-                data[0] = 1.0f-dr;
-                for (int j = 3; j < PME_ORDER; j++) {
-                    real div = RECIP(j-1.0f);
-                    data[j-1] = div*dr*data[j-2];
-                    for (int k = 1; k < (j-1); k++)
-                        data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
-                    data[0] = div*(- dr+1.0f)*data[0];
-                }
-                data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-                for (int j = 1; j < (PME_ORDER-1); j++)
-                    data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
-                data[0] = scale*(-dr+1.0f)*data[0];
-            }
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
-                }
-            }
-        }
-    }
-}
-#else
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-KERNEL void findAtomRangeForGrid(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL int* RESTRICT pmeAtomRange, GLOBAL const real4* RESTRICT posq) {
-    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
-    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-    // Fill in values beyond the last atom.
-    if (GLOBAL_ID == GLOBAL_SIZE-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
-/**
- * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
- * some work in the charge spreading kernel.
- */
-KERNEL void recordZIndex(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL const real4* RESTRICT posq, real4 periodicBoxSize, real4 recipBoxVecZ) {
-    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
-    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
-    for (int i = start; i < end; ++i) {
-        real posz = posq[pmeAtomGridIndex[i].x].z;
-        posz -= floor(posz*recipBoxVecZ.z)*periodicBoxSize.z;
-        int z = ((int) ((posz*recipBoxVecZ.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
-        pmeAtomGridIndex[i].y = z;
-    }
-}
-KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
-        GLOBAL const int2* RESTRICT pmeAtomGridIndex, GLOBAL const int* RESTRICT pmeAtomRange,
-        GLOBAL const real4* RESTRICT pmeBsplineTheta
-#ifdef CHARGE_FROM_SIGEPS
-        , GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        , GLOBAL const real* RESTRICT charges
-#endif
-    ) {
-    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    for (int gridIndex = GLOBAL_ID; gridIndex < numGridPoints; gridIndex += GLOBAL_SIZE) {
-        // Compute the charge on a grid point.
-        int4 gridPoint;
-        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
-        gridPoint.y = remainder/GRID_SIZE_Z;
-        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        real result = 0.0f;
-        // Loop over all atoms that affect this grid point.
-        for (int ix = 0; ix < PME_ORDER; ++ix) {
-            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
-            for (int iy = 0; iy < PME_ORDER; ++iy) {
-                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
-                int z1 = gridPoint.z-PME_ORDER+1;
-                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
-                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
-                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
-                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
-                int firstAtom = pmeAtomRange[gridIndex1];
-                int lastAtom = pmeAtomRange[gridIndex2+1];
-                for (int i = firstAtom; i < lastAtom; ++i)
-                {
-                    int2 atomData = pmeAtomGridIndex[i];
-                    int atomIndex = atomData.x;
-                    int z = atomData.y;
-                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
-                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
-                }
-                if (z1 > gridPoint.z)
-                {
-                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
-                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
-                    firstAtom = pmeAtomRange[gridIndex1];
-                    lastAtom = pmeAtomRange[gridIndex2+1];
-                    for (int i = firstAtom; i < lastAtom; ++i)
-                    {
-                        int2 atomData = pmeAtomGridIndex[i];
-                        int atomIndex = atomData.x;
-                        int z = atomData.y;
-                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
-                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
-                    }
-                }
-            }
-        }
-        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
-    }
-}
-#endif
 KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
        GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,

--- a/platforms/opencl/include/OpenCLBondedUtilities.h
+++ b/platforms/opencl/include/OpenCLBondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -133,12 +133,6 @@ public:
     * Initialize this object in preparation for a simulation.
     */
    void initialize(const System& system);
-    /**
-     * Get the number of force buffers required for bonded forces.
-     */
-    int getNumForceBuffers() {
-        return numForceBuffers;
-    }
    /**
     * Compute the bonded interactions.
     * 
@@ -148,19 +142,17 @@ public:
 private:
    std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
    OpenCLContext& context;
-    std::vector<cl::Kernel> kernels;
+    cl::Kernel kernel;
    std::vector<std::vector<std::vector<int> > > forceAtoms;
    std::vector<int> indexWidth;
    std::vector<std::string> forceSource;
    std::vector<int> forceGroup;
-    std::vector<std::vector<int> > forceSets;
    std::vector<cl::Memory*> arguments;
    std::vector<std::string> argTypes;
    std::vector<OpenCLArray> atomIndices;
-    std::vector<OpenCLArray> bufferIndices;
    std::vector<std::string> prefixCode;
    std::vector<std::string> energyParameterDerivatives;
-    int numForceBuffers, maxBonds, allGroups;
+    int maxBonds, allGroups;
    bool hasInitializedKernels;
 };

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -125,7 +125,7 @@ public:
     * Get the number of force buffers required for nonbonded forces.
     */
    int getNumForceBuffers() const {
-        return numForceBuffers;
+        return 1;
    }
    /**
     * Get the number of energy buffers required for nonbonded forces.
@@ -331,7 +331,7 @@ private:
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
-    int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
+    int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
    int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
    unsigned int tilesAfterReorder;
    long long numTiles;

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -34,7 +34,7 @@
 using namespace OpenMM;
 using namespace std;
-OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
+OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
 }
 void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
@@ -85,11 +85,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
    if (numForces == 0)
        return;
-    // Build the lists of atom indices and buffer indices.
+    // Build the lists of atom indices.
-    vector<vector<cl_uint> > bufferVec(numForces);
-    vector<vector<int> > bufferCounter(numForces, vector<int>(system.getNumParticles(), 0));
-    vector<int> numBuffers(numForces, 0);
    atomIndices.resize(numForces);
    for (int i = 0; i < numForces; i++) {
        int numBonds = forceAtoms[i].size();
@@ -102,100 +99,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        }
        atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices");
        atomIndices[i].upload(indexVec);
-        bufferVec[i].resize(width*numBonds, 0);
-        for (int bond = 0; bond < numBonds; bond++) {
-            for (int atom = 0; atom < numAtoms; atom++)
-                bufferVec[i][bond*width+atom] = bufferCounter[i][forceAtoms[i][bond][atom]]++;
-        }
-        for (int j = 0; j < (int) bufferCounter[i].size(); j++)
-            numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
-    }
-    // For efficiency, we want to merge multiple forces into a single kernel - but only if that
-    // won't increase the number of force buffers.
-    if (context.getSupports64BitGlobalAtomics()) {
-        // Put all the forces in the same set.
-        numForceBuffers = 1;
-        forceSets.push_back(vector<int>());
-        for (int i = 0; i < numForces; i++)
-            forceSets[0].push_back(i);
-    }
-    else {
-        // Figure out how many force buffers will be required.
-        for (int i = 0; i < numForces; i++)
-            numForceBuffers = max(numForceBuffers, numBuffers[i]);
-        int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
-        if (context.getNonbondedUtilities().getHasInteractions())
-            bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
-        // Figure out sets of forces that can be merged.
-        vector<int> unmerged(numForces);
-        for (int i = 0; i < numForces; i++)
-            unmerged[i] = i;
-        for (int i = 0; i < numForces; i++)
-            for (int j = i-1; j >= 0; j--) {
-                if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
-                    break;
-                int temp = unmerged[j+1];
-                unmerged[j+1] = unmerged[j];
-                unmerged[j] = temp;
-            }
-        while (unmerged.size() > 0) {
-            int sum = numBuffers[unmerged.back()];
-            int i;
-            for (i = 0; i < (int) unmerged.size()-1; i++) {
-                if (sum+numBuffers[unmerged[i]] > bufferLimit)
-                    break;
-                sum += numBuffers[unmerged[i]];
-            }
-            forceSets.push_back(vector<int>());
-            for (int j = 0; j < i; j++)
-                forceSets.back().push_back(unmerged[j]);
-            forceSets.back().push_back(unmerged.back());
-            for (int j = 0; j < i; j++)
-                unmerged.erase(unmerged.begin());
-            unmerged.pop_back();
-        }
-    }
-    // Update the buffer indices based on merged sets.
-    bufferIndices.resize(numForces);
-    for (int i = 0; i < (int) forceSets.size(); i++)
-        for (int j = 0; j < (int) forceSets[i].size(); j++) {
-            int force = forceSets[i][j];
-            int numBonds = forceAtoms[force].size();
-            int numAtoms = forceAtoms[force][0].size();
-            int width = indexWidth[force];
-            for (int k = 0; k < j; k++)
-                for (int bond = 0; bond < numBonds; bond++)
-                    for (int atom = 0; atom < numAtoms; atom++)
-                        bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
-            bufferIndices[force].initialize<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
-            bufferIndices[force].upload(bufferVec[force]);
    }
-    // Create the kernels.
+    // Create the kernel.
-    for (auto& set : forceSets) {
-        int setSize = set.size();
    stringstream s;
-        s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
-        s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-        s<<"#endif\n";
    for (int i = 0; i < (int) prefixCode.size(); i++)
        s<<prefixCode[i];
-        string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
+    s<<"__kernel void computeBondedForces(__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
-        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+    for (int force = 0; force < numForces; force++) {
-        for (int i = 0; i < setSize; i++) {
-            int force = set[i];
        string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
-            s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
+        s<<", __global const "<<indexType<<"* restrict atomIndices"<<force;
-            s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
    }
    for (int i = 0; i < (int) arguments.size(); i++)
        s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
@@ -205,10 +119,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
    s<<"mixed energy = 0;\n";
    for (int i = 0; i < energyParameterDerivatives.size(); i++)
        s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
-        for (int i = 0; i < setSize; i++) {
+    for (int force = 0; force < numForces; force++)
-            int force = set[i];
+        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
-            s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
-        }
    s<<"energyBuffer[get_global_id(0)] += energy;\n";
    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
    int numDerivs = allParamDerivNames.size();
@@ -220,8 +132,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
    map<string, string> defines;
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    cl::Program program = context.createProgram(s.str(), defines);
-        kernels.push_back(cl::Kernel(program, "computeBondedForces"));
+    kernel = cl::Kernel(program, "computeBondedForces");
-    }
    forceAtoms.clear();
    forceSource.clear();
 }
@@ -247,7 +158,6 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
    s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
    s<<"    "<<indexType<<" atoms = atomIndices"<<forceIndex<<"[index];\n";
-    s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
        s<<"    real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
@@ -255,17 +165,9 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    {\n";
-        if (context.getSupports64BitGlobalAtomics()) {
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".x));\n";
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"], realToFixedPoint(force"<<(i+1)<<".x));\n";
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".y));\n";
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".y));\n";
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".z));\n";
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".z));\n";
-        }
-        else {
-            s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
-            s<<"    real4 force = forceBuffers[offset];\n";
-            s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
-            s<<"    forceBuffers[offset] = force;\n";
-        }
        s<<"    }\n";
    }
    s<<"}\n";
@@ -277,28 +179,18 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
        return;
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
-        for (int i = 0; i < (int) forceSets.size(); i++) {
        int index = 0;
-            cl::Kernel& kernel = kernels[i];
-            if (context.getSupports64BitGlobalAtomics())
        kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
-            else
-                kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
        index += 6;
-            for (int j = 0; j < (int) forceSets[i].size(); j++) {
+        for (int j = 0; j < (int) atomIndices.size(); j++)
-                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]].getDeviceBuffer());
+            kernel.setArg<cl::Buffer>(index++, atomIndices[j].getDeviceBuffer());
-                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]].getDeviceBuffer());
-            }
        for (int j = 0; j < (int) arguments.size(); j++)
            kernel.setArg<cl::Memory>(index++, *arguments[j]);
        if (energyParameterDerivatives.size() > 0)
            kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
    }
-    }
-    for (int i = 0; i < (int) kernels.size(); i++) {
-        cl::Kernel& kernel = kernels[i];
    kernel.setArg<cl_int>(3, groups);
    if (context.getUseDoublePrecision()) {
        kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble());
@@ -314,6 +206,5 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
        kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY());
        kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ());
    }
-        context.executeKernel(kernels[i], maxBonds);
+    context.executeKernel(kernel, maxBonds);
-    }
 }