Improved support for devices without 64 bit atomics (#3737)

ae686364 · Peter Eastman · GitHub · 48664a1f · ae686364 · ae686364
Unverified Commit ae686364 authored Aug 17, 2022 by Peter Eastman Committed by GitHub Aug 17, 2022
20 changed files
--- a/platforms/common/include/openmm/common/CommonKernels.h
+++ b/platforms/common/include/openmm/common/CommonKernels.h
@@ -786,8 +786,6 @@ private:
    ComputeArray globals;
    ComputeArray donors;
    ComputeArray acceptors;
-    ComputeArray donorBufferIndices;
-    ComputeArray acceptorBufferIndices;
    ComputeArray donorExclusions;
    ComputeArray acceptorExclusions;
    std::vector<std::string> globalParamNames;

--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
@@ -1483,8 +1483,6 @@ void CommonCalcCustomCentroidBondForceKernel::initialize(const System& system, c
    numBonds = force.getNumBonds();
    if (numBonds == 0)
        return;
-    if (!cc.getSupports64BitGlobalAtomics())
-        throw OpenMMException("CustomCentroidBondForce requires a device that supports 64 bit atomic operations");
    info = new ForceInfo(force);
    cc.addForce(info);
    
@@ -2380,8 +2378,7 @@ double CommonCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
    if (interactionGroupData.isInitialized()) {
        if (!hasInitializedKernel) {
            hasInitializedKernel = true;
-            bool useLong = cc.getSupports64BitGlobalAtomics();
-            interactionGroupKernel->addArg((useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()));
+            interactionGroupKernel->addArg(cc.getLongForceBuffer());
            interactionGroupKernel->addArg(cc.getEnergyBuffer());
            interactionGroupKernel->addArg(cc.getPosq());
            interactionGroupKernel->addArg((useNeighborList ? filteredGroupData : interactionGroupData));
@@ -2501,15 +2498,8 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
    charges.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "gbsaObcCharges");
    bornRadii.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "bornRadii");
    obcChain.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "obcChain");
-    if (cc.getSupports64BitGlobalAtomics()) {
-        bornSum.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornSum");
-        bornForce.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornForce");
-    }
-    else {
-        int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
-        bornSum.initialize(cc, bufferSize, elementSize, "bornSum");
-        bornForce.initialize(cc, bufferSize, elementSize, "bornForce");
-    }
+    bornSum.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornSum");
+    bornForce.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornForce");
    cc.addAutoclearBuffer(bornSum);
    cc.addAutoclearBuffer(bornForce);
    vector<double> chargeVec(cc.getPaddedNumAtoms());
@@ -2541,7 +2531,7 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
    nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
    nb.addParameter(ComputeParameterInfo(charges, prefix+"charge", "float", 1));
    nb.addParameter(ComputeParameterInfo(params, prefix+"obcParams", "float", 2));
-    nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", cc.getSupports64BitGlobalAtomics() ? "mm_long" : "real", 1));
+    nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", "mm_long", 1));
    info = new ForceInfo(force);
    cc.addForce(info);
 }
@@ -2580,7 +2570,6 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        else
            file = CommonKernelSources::gbsaObc;
        ComputeProgram program = cc.compileProgram(file, defines);
-        bool useLong = cc.getSupports64BitGlobalAtomics();
        computeBornSumKernel = program->createKernel("computeBornSum");
        computeBornSumKernel->addArg(bornSum);
        computeBornSumKernel->addArg(cc.getPosq());
@@ -2600,7 +2589,7 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
            computeBornSumKernel->addArg(numAtomBlocks*(numAtomBlocks+1)/2);
        computeBornSumKernel->addArg(nb.getExclusionTiles());
        force1Kernel = program->createKernel("computeGBSAForce1");
-        force1Kernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
+        force1Kernel->addArg(cc.getLongForceBuffer());
        force1Kernel->addArg(bornForce);
        force1Kernel->addArg(cc.getEnergyBuffer());
        force1Kernel->addArg(cc.getPosq());
@@ -2626,19 +2615,11 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
        reduceBornSumKernel->addArg(0.8f);
        reduceBornSumKernel->addArg(4.85f);
        reduceBornSumKernel->addArg(bornSum);
-        if (!useLong) {
-            reduceBornSumKernel->addArg(cc.getPaddedNumAtoms());
-            reduceBornSumKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
-        }
        reduceBornSumKernel->addArg(params);
        reduceBornSumKernel->addArg(bornRadii);
        reduceBornSumKernel->addArg(obcChain);
        reduceBornForceKernel = program->createKernel("reduceBornForce");
        reduceBornForceKernel->addArg(bornForce);
-        if (!useLong) {
-            reduceBornForceKernel->addArg(cc.getPaddedNumAtoms());
-            reduceBornForceKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
-        }
        reduceBornForceKernel->addArg(cc.getEnergyBuffer());
        reduceBornForceKernel->addArg(params);
        reduceBornForceKernel->addArg(bornRadii);
@@ -2872,28 +2853,17 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
            energyParamDerivExpressions[i].push_back(ex.differentiate(force.getEnergyParameterDerivativeName(j)).optimize());
    }
    bool deviceIsCpu = cc.getIsCPU();
-    bool useLong = cc.getSupports64BitGlobalAtomics();
    int elementSize = (cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-    if (useLong) {
-        valueBuffers.initialize<long long>(cc, cc.getPaddedNumAtoms(), "customGBValueBuffers");
-        longEnergyDerivs.initialize<long long>(cc, numComputedValues*cc.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
-        energyDerivs = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
-    }
-    else {
-        int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
-        valueBuffers.initialize(cc, bufferSize, elementSize, "customGBValueBuffers");
-        energyDerivs = new ComputeParameterSet(cc, numComputedValues, bufferSize, "customGBEnergyDerivatives", true);
-    }
+    valueBuffers.initialize<long long>(cc, cc.getPaddedNumAtoms(), "customGBValueBuffers");
+    longEnergyDerivs.initialize<long long>(cc, numComputedValues*cc.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
+    energyDerivs = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
    cc.addAutoclearBuffer(valueBuffers);
    energyDerivChain = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivativeChain", true);
    needEnergyParamDerivs = (force.getNumEnergyParameterDerivatives() > 0);
    dValue0dParam.resize(force.getNumEnergyParameterDerivatives());
    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
        dValuedParam.push_back(new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "dValuedParam", true, cc.getUseDoublePrecision()));
-        if (useLong)
-            dValue0dParam[i].initialize<long long>(cc, cc.getPaddedNumAtoms(), "dValue0dParam");
-        else
-            dValue0dParam[i].initialize(cc, cc.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "dValue0dParam");
+        dValue0dParam[i].initialize<long long>(cc, cc.getPaddedNumAtoms(), "dValue0dParam");
        cc.addAutoclearBuffer(dValue0dParam[i]);
        string name = force.getEnergyParameterDerivativeName(i);
        cc.addEnergyParameterDerivative(name);
@@ -2960,10 +2930,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
        for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
            string derivName = "dValue0dParam"+cc.intToString(i+1);
-            if (useLong)
-                extraArgs << ", GLOBAL mm_ulong* RESTRICT global_" << derivName;
-            else
-                extraArgs << ", GLOBAL real* RESTRICT global_" << derivName;
+            extraArgs << ", GLOBAL mm_ulong* RESTRICT global_" << derivName;
            atomParams << "LOCAL real local_" << derivName << "[LOCAL_BUFFER_SIZE];\n";
            loadLocal2 << "local_" << derivName << "[localAtomIndex] = 0;\n";
            load1 << "real " << derivName << " = 0;\n";
@@ -2975,20 +2942,11 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
                    tempDerivs2 << "local_" << derivName << "[j] += temp_" << derivName << "_2;\n";
                else
                    tempDerivs2 << "local_" << derivName << "[tbx+tj] += temp_" << derivName << "_2;\n";
-                if (useLong) {
-                    storeDeriv1 << "ATOMIC_ADD(&global_" << derivName << "[offset1], (mm_ulong) realToFixedPoint(" << derivName << "));\n";
-                    if (deviceIsCpu)
-                        storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[tgx]));\n";
-                    else
-                        storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[LOCAL_ID]));\n";
-                }
-                else {
-                    storeDeriv1 << "global_" << derivName << "[offset1] += " << derivName << ";\n";
-                    if (deviceIsCpu)
-                        storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[tgx];\n";
-                    else
-                        storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[LOCAL_ID];\n";
-                }
+                storeDeriv1 << "ATOMIC_ADD(&global_" << derivName << "[offset1], (mm_ulong) realToFixedPoint(" << derivName << "));\n";
+                if (deviceIsCpu)
+                    storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[tgx]));\n";
+                else
+                    storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[LOCAL_ID]));\n";
            }
        }
        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
@@ -3039,16 +2997,8 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
        }
        for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
            string variableName = "dValuedParam_0_"+cc.intToString(i);
-            if (useLong) {
-                extraArgs << ", GLOBAL const mm_long* RESTRICT dValue0dParam" << i;
-                deriv0 << "real " << variableName << " = RECIP((real) 0x100000000)*dValue0dParam" << i << "[index];\n";
-            }
-            else {
-                extraArgs << ", GLOBAL const real* RESTRICT dValue0dParam" << i;
-                deriv0 << "real " << variableName << " = dValue0dParam" << i << "[index];\n";
-                deriv0 << "for (int i = index+bufferSize; i < totalSize; i += bufferSize)\n";
-                deriv0 << "    " << variableName << " += dValue0dParam" << i << "[i];\n";
-            }
+            extraArgs << ", GLOBAL const mm_long* RESTRICT dValue0dParam" << i;
+            deriv0 << "real " << variableName << " = RECIP((real) 0x100000000)*dValue0dParam" << i << "[index];\n";
            for (int j = 0; j < dValuedParam[i]->getParameterInfos().size(); j++)
                extraArgs << ", GLOBAL real* RESTRICT global_dValuedParam_" << j << "_" << i;
            deriv0 << "global_dValuedParam_0_" << i << "[index] = dValuedParam_0_" << i << ";\n";
@@ -3129,21 +3079,11 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
            map<string, Lepton::ParsedExpression> n2EnergyExpressions;
            n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize();
            n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize();
-            if (useLong) {
-                for (int j = 0; j < numComputedValues; j++) {
-                    if (needChainForValue[j]) {
-                        string index = cc.intToString(j+1);
-                        n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_1 += "] = energyDerivExpressions[i][2*j];
-                        n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
-                    }
-                }
-            }
-            else {
-                for (int j = 0; j < numComputedValues; j++) {
-                    if (needChainForValue[j]) {
-                        n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_1")+" += "] = energyDerivExpressions[i][2*j];
-                        n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_2")+" += "] = energyDerivExpressions[i][2*j+1];
-                    }
+            for (int j = 0; j < numComputedValues; j++) {
+                if (needChainForValue[j]) {
+                    string index = cc.intToString(j+1);
+                    n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_1 += "] = energyDerivExpressions[i][2*j];
+                    n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
                }
            }
            for (int j = 0; j < force.getNumEnergyParameterDerivatives(); j++)
@@ -3188,32 +3128,16 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
                pairEnergyUsesValue[i] = true;
            }
        }
-        if (useLong) {
-            extraArgs << ", GLOBAL mm_ulong* RESTRICT derivBuffers";
-            for (int i = 0; i < numComputedValues; i++) {
-                string index = cc.intToString(i+1);
-                atomParams << "LOCAL real local_deriv" << index << "[LOCAL_BUFFER_SIZE];\n";
-                clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
-                declare1 << "real deriv" << index << "_1 = 0;\n";
-                load2 << "real deriv" << index << "_2 = 0;\n";
-                recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
-                storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
-                storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
-            }
-        }
-        else {
-            for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++) {
-                const ComputeParameterInfo& buffer = energyDerivs->getParameterInfos()[i];
-                string index = cc.intToString(i+1);
-                extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivBuffers" << index;
-                atomParams << "LOCAL " << buffer.getType() << " local_deriv" << index << "[LOCAL_BUFFER_SIZE];\n";
-                clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
-                declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
-                load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
-                recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
-                storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
-                storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
-            }
+        extraArgs << ", GLOBAL mm_ulong* RESTRICT derivBuffers";
+        for (int i = 0; i < numComputedValues; i++) {
+            string index = cc.intToString(i+1);
+            atomParams << "LOCAL real local_deriv" << index << "[LOCAL_BUFFER_SIZE];\n";
+            clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
+            declare1 << "real deriv" << index << "_1 = 0;\n";
+            load2 << "real deriv" << index << "_2 = 0;\n";
+            recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
+            storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
+            storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
        }
        if (needEnergyParamDerivs) {
            extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
@@ -3285,16 +3209,10 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
            string index = cc.intToString(i+1);
            extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivChain" << index;
        }
-        if (useLong) {
-            extraArgs << ", GLOBAL const mm_long* RESTRICT derivBuffersIn";
-            for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
-                reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
-                        " = RECIP((real) 0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cc.intToString(i) << "];\n";
-        }
-        else {
-            for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++)
-                reduce << "REDUCE_VALUE(derivBuffers" << cc.intToString(i+1) << ", " << energyDerivs->getParameterInfos()[i].getType() << ")\n";
-        }
+        extraArgs << ", GLOBAL const mm_long* RESTRICT derivBuffersIn";
+        for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
+            reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
+                    " = RECIP((real) 0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cc.intToString(i) << "];\n";
        if (needEnergyParamDerivs) {
            extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
            const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames();
@@ -3353,13 +3271,9 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
            string index = cc.intToString(i+1);
            compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
        }
-        if (useLong) {
-            compute << "forceBuffers[index] += realToFixedPoint(force.x);\n";
-            compute << "forceBuffers[index+PADDED_NUM_ATOMS] += realToFixedPoint(force.y);\n";
-            compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += realToFixedPoint(force.z);\n";
-        }
-        else
-            compute << "forceBuffers[index] = forceBuffers[index]+make_real4(force.x, force.y, force.z, 0);\n";
+        compute << "forceBuffers[index] += realToFixedPoint(force.x);\n";
+        compute << "forceBuffers[index+PADDED_NUM_ATOMS] += realToFixedPoint(force.y);\n";
+        compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += realToFixedPoint(force.z);\n";
        for (int i = 1; i < numComputedValues; i++) {
            compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
            for (int j = 1; j < i; j++)
@@ -3548,12 +3462,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
    }
    info = new ForceInfo(force);
    cc.addForce(info);
-    if (useLong)
-        cc.addAutoclearBuffer(longEnergyDerivs);
-    else {
-        for (auto& buffer : energyDerivs->getParameterInfos())
-            cc.addAutoclearBuffer(buffer.getArray());
-    }
+    cc.addAutoclearBuffer(longEnergyDerivs);
 }

 double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -3600,7 +3509,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
        int numAtomBlocks = cc.getPaddedNumAtoms()/32;
-        bool useLong = cc.getSupports64BitGlobalAtomics();
        pairValueKernel->addArg(cc.getPosq());
        pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusions());
        pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusionTiles());
@@ -3631,10 +3539,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            pairValueKernel->addArg(function);
        perParticleValueKernel->addArg(cc.getPosq());
        perParticleValueKernel->addArg(valueBuffers);
-        if (!useLong) {
-            perParticleValueKernel->addArg(cc.getPaddedNumAtoms());
-            perParticleValueKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
-        }
        if (globals.isInitialized())
            perParticleValueKernel->addArg(globals);
        for (auto& buffer : params->getParameterInfos())
@@ -3648,7 +3552,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
        }
        for (auto& function : tabulatedFunctionArrays)
            perParticleValueKernel->addArg(function);
-        pairEnergyKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
+        pairEnergyKernel->addArg(cc.getLongForceBuffer());
        pairEnergyKernel->addArg(cc.getEnergyBuffer());
        pairEnergyKernel->addArg(cc.getPosq());
        pairEnergyKernel->addArg(cc.getNonbondedUtilities().getExclusions());
@@ -3680,24 +3584,14 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
                pairEnergyKernel->addArg(buffer.getArray());
            }
        }
-        if (useLong)
-            pairEnergyKernel->addArg(longEnergyDerivs);
-        else
-            for (auto& buffer : energyDerivs->getParameterInfos())
-                pairEnergyKernel->addArg(buffer.getArray());
+        pairEnergyKernel->addArg(longEnergyDerivs);
        if (needEnergyParamDerivs)
            pairEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
        for (auto& function : tabulatedFunctionArrays)
            pairEnergyKernel->addArg(function);
        perParticleEnergyKernel->addArg(cc.getEnergyBuffer());
        perParticleEnergyKernel->addArg(cc.getPosq());
-        if (cc.getSupports64BitGlobalAtomics())
-            perParticleEnergyKernel->addArg(cc.getLongForceBuffer());
-        else {
-            perParticleEnergyKernel->addArg(cc.getForceBuffers());
-            perParticleEnergyKernel->addArg(cc.getPaddedNumAtoms());
-            perParticleEnergyKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
-        }
+        perParticleEnergyKernel->addArg(cc.getLongForceBuffer());
        if (globals.isInitialized())
            perParticleEnergyKernel->addArg(globals);
        for (auto& buffer : params->getParameterInfos())
@@ -3708,15 +3602,14 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            perParticleEnergyKernel->addArg(buffer.getArray());
        for (auto& buffer : energyDerivChain->getParameterInfos())
            perParticleEnergyKernel->addArg(buffer.getArray());
-        if (useLong)
-            perParticleEnergyKernel->addArg(longEnergyDerivs);
+        perParticleEnergyKernel->addArg(longEnergyDerivs);
        if (needEnergyParamDerivs)
            perParticleEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
        for (auto& function : tabulatedFunctionArrays)
            perParticleEnergyKernel->addArg(function);
        if (needParameterGradient || needEnergyParamDerivs) {
            gradientChainRuleKernel->addArg(cc.getPosq());
-            gradientChainRuleKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
+            gradientChainRuleKernel->addArg(cc.getLongForceBuffer());
            if (globals.isInitialized())
                gradientChainRuleKernel->addArg(globals);
            for (auto& buffer : params->getParameterInfos())
@@ -3937,33 +3830,6 @@ void CommonCalcCustomHbondForceKernel::initialize(const System& system, const Cu
    }
    acceptors.upload(acceptorVector);
    acceptorParams->setParameterValues(acceptorParamVector);
-
-    // Select an output buffer index for each donor and acceptor.
-
-    if (!cc.getSupports64BitGlobalAtomics()) {
-        donorBufferIndices.initialize<mm_int4>(cc, numDonors, "customHbondDonorBuffers");
-        acceptorBufferIndices.initialize<mm_int4>(cc, numAcceptors, "customHbondAcceptorBuffers");
-        vector<mm_int4> donorBufferVector(numDonors);
-        vector<mm_int4> acceptorBufferVector(numAcceptors);
-        vector<int> donorBufferCounter(numParticles, 0);
-        for (int i = 0; i < numDonors; i++)
-            donorBufferVector[i] = mm_int4(donorVector[i].x > -1 ? donorBufferCounter[donorVector[i].x]++ : 0,
-                                           donorVector[i].y > -1 ? donorBufferCounter[donorVector[i].y]++ : 0,
-                                           donorVector[i].z > -1 ? donorBufferCounter[donorVector[i].z]++ : 0, 0);
-        vector<int> acceptorBufferCounter(numParticles, 0);
-        for (int i = 0; i < numAcceptors; i++)
-            acceptorBufferVector[i] = mm_int4(acceptorVector[i].x > -1 ? acceptorBufferCounter[acceptorVector[i].x]++ : 0,
-                                           acceptorVector[i].y > -1 ? acceptorBufferCounter[acceptorVector[i].y]++ : 0,
-                                           acceptorVector[i].z > -1 ? acceptorBufferCounter[acceptorVector[i].z]++ : 0, 0);
-        donorBufferIndices.upload(donorBufferVector);
-        acceptorBufferIndices.upload(acceptorBufferVector);
-        int maxBuffers = 1;
-        for (int i : donorBufferCounter)
-            maxBuffers = max(maxBuffers, i);
-        for (int i : acceptorBufferCounter)
-            maxBuffers = max(maxBuffers, i);
-        cc.requestForceBuffers(maxBuffers);
-    }
    info = new ForceInfo(force);
    cc.addForce(info);

@@ -4243,12 +4109,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
    }
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
-        if (cc.getSupports64BitGlobalAtomics())
-            donorKernel->addArg(cc.getLongForceBuffer());
-        else {
-            donorKernel->addArg(cc.getForceBuffers());
-            donorKernel->addArg(donorBufferIndices);
-        }
+        donorKernel->addArg(cc.getLongForceBuffer());
        donorKernel->addArg(cc.getEnergyBuffer());
        donorKernel->addArg(cc.getPosq());
        donorKernel->addArg(donorExclusions);
@@ -4264,12 +4125,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
            donorKernel->addArg(parameter.getArray());
        for (auto& function : tabulatedFunctionArrays)
            donorKernel->addArg(function);
-        if (cc.getSupports64BitGlobalAtomics())
-            acceptorKernel->addArg(cc.getLongForceBuffer());
-        else {
-            acceptorKernel->addArg(cc.getForceBuffers());
-            acceptorKernel->addArg(acceptorBufferIndices);
-        }
+        acceptorKernel->addArg(cc.getLongForceBuffer());
        acceptorKernel->addArg(cc.getEnergyBuffer());
        acceptorKernel->addArg(cc.getPosq());
        acceptorKernel->addArg(acceptorExclusions);
@@ -4286,9 +4142,9 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
        for (auto& function : tabulatedFunctionArrays)
            acceptorKernel->addArg(function);
    }
-    setPeriodicBoxArgs(cc, donorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7);
+    setPeriodicBoxArgs(cc, donorKernel, 6);
    donorKernel->execute(max(numDonors, numAcceptors), 64);
-    setPeriodicBoxArgs(cc, acceptorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7);
+    setPeriodicBoxArgs(cc, acceptorKernel, 6);
    acceptorKernel->execute(max(numDonors, numAcceptors), 64);
    return 0.0;
 }
@@ -4391,8 +4247,6 @@ CommonCalcCustomManyParticleForceKernel::~CommonCalcCustomManyParticleForceKerne

 void CommonCalcCustomManyParticleForceKernel::initialize(const System& system, const CustomManyParticleForce& force) {
    ContextSelector selector(cc);
-    if (!cc.getSupports64BitGlobalAtomics())
-        throw OpenMMException("CustomManyParticleForce requires a device that supports 64 bit atomic operations");
    int numParticles = force.getNumParticles();
    int particlesPerSet = force.getNumParticlesPerSet();
    bool centralParticleMode = (force.getPermutationMode() == CustomManyParticleForce::UniqueCentralParticle);
@@ -4933,9 +4787,6 @@ private:
 };

 void CommonCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
-    if (!cc.getSupports64BitGlobalAtomics())
-        throw OpenMMException("GayBerneForce requires a device that supports 64 bit atomic operations");
-
    // Initialize interactions.

    ContextSelector selector(cc);

--- a/platforms/common/src/IntegrationUtilities.cpp
+++ b/platforms/common/src/IntegrationUtilities.cpp
@@ -528,8 +528,6 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
    for (int i = 0; i < numAtoms; i++)
        if (atomCounts[i] > 1)
            hasOverlappingVsites = true;
-    if (hasOverlappingVsites && !context.getSupports64BitGlobalAtomics())
-        throw OpenMMException("This device does not support 64 bit atomics.  Cannot have multiple virtual sites that depend on the same atom.");

    // Create the kernels used by this class.


--- a/platforms/common/src/kernels/customGBEnergyN2.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2.cc
-#ifdef SUPPORTS_64_BIT_ATOMICS
 #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
 #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[LOCAL_ID]));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
-#endif

 /**
 * Compute a force based on pair interactions.
 */
 KERNEL void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles, int needEnergy,
@@ -160,7 +151,6 @@ KERNEL void computeN2Energy(

        // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -173,18 +163,6 @@ KERNEL void computeN2Energy(
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[LOCAL_ID].z));
            STORE_DERIVATIVES_2
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset = offset1;
-        forceBuffers[offset1].xyz += force.xyz;
-        STORE_DERIVATIVES_1
-        if (x != y) {
-            offset = offset2;
-            forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
-            STORE_DERIVATIVES_2
-        }
-#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -363,7 +341,6 @@ KERNEL void computeN2Energy(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
@@ -376,18 +353,6 @@ KERNEL void computeN2Energy(
                offset = atom2;
                STORE_DERIVATIVES_2
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
-            unsigned int offset = offset1;
-            STORE_DERIVATIVES_1
-            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
-                offset = offset2;
-                STORE_DERIVATIVES_2
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBEnergyN2_cpu.cc
-#ifdef SUPPORTS_64_BIT_ATOMICS
 #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
 #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[tgx]));
-#else
-#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
-#endif

 /**
 * Compute a force based on pair interactions.
 */
 KERNEL void computeN2Energy(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles, int needEnergy,
@@ -100,17 +91,11 @@ KERNEL void computeN2Energy(

                // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                STORE_DERIVATIVES_1
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += force.xyz;
-                STORE_DERIVATIVES_1
-#endif
            }
        }
        else {
@@ -174,33 +159,21 @@ KERNEL void computeN2Energy(

                // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                STORE_DERIVATIVES_1
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += force.xyz;
-                STORE_DERIVATIVES_1
-#endif
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE+tgx;
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(local_force[tgx].x));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
                STORE_DERIVATIVES_2
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz += local_force[tgx].xyz;
-                STORE_DERIVATIVES_2
-#endif
            }
        }
    }
@@ -316,17 +289,11 @@ KERNEL void computeN2Energy(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                    STORE_DERIVATIVES_1
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += force.xyz;
-                    STORE_DERIVATIVES_1
-#endif
                }
            }
            else
@@ -375,17 +342,11 @@ KERNEL void computeN2Energy(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
                    STORE_DERIVATIVES_1
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += force.xyz;
-                    STORE_DERIVATIVES_1
-#endif
                }
            }

@@ -398,17 +359,11 @@ KERNEL void computeN2Energy(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(local_force[tgx].x));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
                    unsigned int offset = atom2;
                    STORE_DERIVATIVES_2
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz += local_force[tgx].xyz;
-                    STORE_DERIVATIVES_2
-#endif
                }
            }
        }

--- a/platforms/common/src/kernels/customGBEnergyPerParticle.cc
+++ b/platforms/common/src/kernels/customGBEnergyPerParticle.cc
@@ -10,20 +10,13 @@
 */

 KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT forceBuffers
-#else
-        GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the derivatives

-#ifndef SUPPORTS_64_BIT_ATOMICS
-        int totalSize = bufferSize*numBuffers;
-#endif
        REDUCE_DERIVATIVES

        // Now calculate the per-particle energy terms.

--- a/platforms/common/src/kernels/customGBGradientChainRule.cc
+++ b/platforms/common/src/kernels/customGBGradientChainRule.cc
@@ -3,29 +3,17 @@
 */

 KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
    GLOBAL mm_long* RESTRICT forceBuffers
-#else
-    GLOBAL real4* RESTRICT forceBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    INIT_PARAM_DERIVS
    const real scale = RECIP((real) 0x100000000);
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        real4 pos = posq[index];
-#ifdef SUPPORTS_64_BIT_ATOMICS
        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
-#else
-        real3 force = trimTo3(forceBuffers[index]);
-#endif
        COMPUTE_FORCES
-#ifdef SUPPORTS_64_BIT_ATOMICS
        forceBuffers[index] = realToFixedPoint(force.x);
        forceBuffers[index+PADDED_NUM_ATOMS] = realToFixedPoint(force.y);
        forceBuffers[index+PADDED_NUM_ATOMS*2] = realToFixedPoint(force.z);
-#else
-        forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
-#endif
    }
    SAVE_PARAM_DERIVS
 }
--- a/platforms/common/src/kernels/customGBValueN2.cc
+++ b/platforms/common/src/kernels/customGBValueN2.cc
@@ -3,11 +3,7 @@
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
-#else
-        GLOBAL real* RESTRICT global_value,
-#endif
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -137,7 +133,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

        // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset1 = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
        STORE_PARAM_DERIVS1
@@ -146,16 +141,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
            ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
            STORE_PARAM_DERIVS2
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        global_value[offset1] += value;
-        STORE_PARAM_DERIVS1
-        if (x != y) {
-            global_value[offset2] += local_value[LOCAL_ID];
-            STORE_PARAM_DERIVS2
-        }
-#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -317,7 +302,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            unsigned int offset1 = atom1;
            ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
            STORE_PARAM_DERIVS1
@@ -326,16 +310,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
                STORE_PARAM_DERIVS2
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            global_value[offset1] += value;
-            STORE_PARAM_DERIVS1
-            if (atom2 < PADDED_NUM_ATOMS) {
-                unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-                global_value[offset2] += local_value[LOCAL_ID];
-                STORE_PARAM_DERIVS2
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/customGBValueN2_cpu.cc
+++ b/platforms/common/src/kernels/customGBValueN2_cpu.cc
@@ -3,11 +3,7 @@
 */
 KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const int2* exclusionTiles,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_value,
-#else
-        GLOBAL real* RESTRICT global_value,
-#endif
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
@@ -84,13 +80,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

                // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
                ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset1] += value;
-#endif
                STORE_PARAM_DERIVS1
            }
        }
@@ -146,26 +137,16 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

                // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
                ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset1] += value;
-#endif
                STORE_PARAM_DERIVS1
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset2 = y*TILE_SIZE+tgx;
                ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
-#else
-                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                global_value[offset2] += local_value[tgx];
-#endif
                STORE_PARAM_DERIVS2
            }
        }
@@ -273,13 +254,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset1] += value;
-#endif
                    STORE_PARAM_DERIVS1
                }
            }
@@ -322,13 +298,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
-#else
-                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset1] += value;
-#endif
                    STORE_PARAM_DERIVS1
                }
            }
@@ -342,13 +313,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset2 = atom2;
                    ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
-#else
-                    unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    global_value[offset2] += local_value[tgx];
-#endif
                    STORE_PARAM_DERIVS2
                }
            }

--- a/platforms/common/src/kernels/customGBValuePerParticle.cc
+++ b/platforms/common/src/kernels/customGBValuePerParticle.cc
@@ -3,23 +3,12 @@
 */

 KERNEL void computePerParticleValues(GLOBAL real4* posq,
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* valueBuffers
-#else
-        GLOBAL real* valueBuffers, int bufferSize, int numBuffers
-#endif
        PARAMETER_ARGUMENTS) {
    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the pairwise value

-#ifdef SUPPORTS_64_BIT_ATOMICS
        real sum = valueBuffers[index]/(real) 0x100000000;
-#else
-        int totalSize = bufferSize*numBuffers;
-        real sum = valueBuffers[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += valueBuffers[i];
-#endif
        REDUCE_PARAM0_DERIV
        
        // Now calculate other values

--- a/platforms/common/src/kernels/customHbondForce.cc
+++ b/platforms/common/src/kernels/customHbondForce.cc
@@ -44,11 +44,7 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
 * Compute forces on donors.
 */
 KERNEL void computeDonorForces(
-#ifdef SUPPORTS_64_BIT_ATOMICS
 	GLOBAL mm_ulong* RESTRICT force,
-#else
-	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
-#endif
 	GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -114,7 +110,6 @@ KERNEL void computeDonorForces(
        // Write results

        if (donorIndex < NUM_DONORS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
            if (atoms.x > -1) {
                ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
@@ -133,27 +128,6 @@ KERNEL void computeDonorForces(
                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
                MEM_FENCE;
            }
-#else
-            int4 bufferIndices = donorBufferIndices[donorIndex];
-            if (atoms.x > -1) {
-                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f1.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.y > -1) {
-                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f2.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.z > -1) {
-                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f3.xyz;
-                forceBuffers[offset] = force;
-            }
-#endif
        }
    }
    energyBuffer[GLOBAL_ID] += energy;
@@ -162,11 +136,7 @@ KERNEL void computeDonorForces(
 * Compute forces on acceptors.
 */
 KERNEL void computeAcceptorForces(
-#ifdef SUPPORTS_64_BIT_ATOMICS
 	GLOBAL mm_ulong* RESTRICT force,
-#else
-	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -231,7 +201,6 @@ KERNEL void computeAcceptorForces(
        // Write results

        if (acceptorIndex < NUM_ACCEPTORS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
            if (atoms.x > -1) {
                ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
@@ -250,27 +219,6 @@ KERNEL void computeAcceptorForces(
                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
                MEM_FENCE;
            }
-#else
-            int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
-            if (atoms.x > -1) {
-                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f1.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.y > -1) {
-                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f2.xyz;
-                forceBuffers[offset] = force;
-            }
-            if (atoms.z > -1) {
-                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                real4 force = forceBuffers[offset];
-                force.xyz += f3.xyz;
-                forceBuffers[offset] = force;
-            }
-#endif
        }
    }
 }
--- a/platforms/common/src/kernels/customNonbondedGroups.cc
+++ b/platforms/common/src/kernels/customNonbondedGroups.cc
@@ -35,38 +35,8 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
 #endif
 }

-#ifndef SUPPORTS_64_BIT_ATOMICS
-/**
- * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
- * a single tile might have computed forces on the same atom.  This loops over them and makes sure
- * that only one thread updates the force on any given atom.
- */
-void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
-    localData[LOCAL_ID].x = atomIndex;
-    SYNC_WARPS;
-    real4 forceSum = make_real4(0);
-    int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
-    int end = start+32;
-    bool isFirst = true;
-    for (int i = start; i < end; i++)
-        if (localData[i].x == atomIndex) {
-            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
-            isFirst &= (i >= LOCAL_ID);
-        }
-    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
-    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
-    if (isFirst)
-        forceBuffers[offset] += forceSum;
-    SYNC_WARPS;
-}
-#endif
-
 KERNEL void computeInteractionGroups(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers,
-#else
-        GLOBAL real4* RESTRICT forceBuffers,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
        GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
@@ -139,7 +109,6 @@ KERNEL void computeInteractionGroups(
            }
            SYNC_WARPS;
        }
-#ifdef SUPPORTS_64_BIT_ATOMICS
        if (exclusions != 0) {
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -149,13 +118,6 @@ KERNEL void computeInteractionGroups(
        ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
        ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
        SYNC_WARPS;
-#else
-        writeForces(forceBuffers, localData, atom2);
-        localData[LOCAL_ID].fx = force.x;
-        localData[LOCAL_ID].fy = force.y;
-        localData[LOCAL_ID].fz = force.z;
-        writeForces(forceBuffers, localData, atom1);
-#endif
    }
    energyBuffer[GLOBAL_ID] += energy;
    SAVE_DERIVATIVES

--- a/platforms/common/src/kernels/gbsaObc.cc
+++ b/platforms/common/src/kernels/gbsaObc.cc
@@ -17,11 +17,7 @@ typedef struct ALIGN {
 * Compute the Born sum.
 */
 KERNEL void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT global_bornSum,
-#else
-        GLOBAL real* RESTRICT global_bornSum,
-#endif
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -152,20 +148,12 @@ KERNEL void computeBornSum(

        // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        global_bornSum[offset1] += bornSum;
-        if (x != y)
-            global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
-#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -357,17 +345,9 @@ KERNEL void computeBornSum(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum));
            if (atom2 < PADDED_NUM_ATOMS)
                ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            global_bornSum[offset1] += bornSum;
-            if (atom2 < PADDED_NUM_ATOMS)
-                global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
-#endif
        }
        pos++;
    }
@@ -385,11 +365,7 @@ typedef struct ALIGN {
 */

 KERNEL void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
-#else
-        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
@@ -538,7 +514,6 @@ KERNEL void computeGBSAForce1(
        
        // Write results.
        
-#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
@@ -551,16 +526,6 @@ KERNEL void computeGBSAForce1(
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
            ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
        }
-#else
-        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
-        global_bornForce[offset1] += force.w;
-        if (x != y) {
-            forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
-            global_bornForce[offset2] += localData[LOCAL_ID].fw;
-        }
-#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -763,7 +728,6 @@ KERNEL void computeGBSAForce1(
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
-#ifdef SUPPORTS_64_BIT_ATOMICS
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
@@ -774,16 +738,6 @@ KERNEL void computeGBSAForce1(
                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
                ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
            }
-#else
-            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
-            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
-            global_bornForce[offset1] += force.w;
-            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
-                global_bornForce[offset2] += localData[LOCAL_ID].fw;
-            }
-#endif
        }
        pos++;
    }

--- a/platforms/common/src/kernels/gbsaObc2.cc
+++ b/platforms/common/src/kernels/gbsaObc2.cc
@@ -16,13 +16,8 @@
    real t2I = (l_ij2I-u_ij2I);
    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
-#ifdef SUPPORTS_64_BIT_ATOMICS
    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
-#else
-    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
-    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
-#endif
 #ifdef USE_CUTOFF
    unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
 #else

--- a/platforms/common/src/kernels/gbsaObcReductions.cc
+++ b/platforms/common/src/kernels/gbsaObcReductions.cc
@@ -6,23 +6,12 @@
 */

 KERNEL void reduceBornSum(float alpha, float beta, float gamma,
-#ifdef SUPPORTS_64_BIT_ATOMICS
            GLOBAL const mm_long* RESTRICT bornSum,
-#else
-            GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
-#endif
            GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Get summed Born data

-#ifdef SUPPORTS_64_BIT_ATOMICS
        real sum = RECIP((real) 0x100000000)*bornSum[index];
-#else
-        real sum = bornSum[index];
-        int totalSize = bufferSize*numBuffers;
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += bornSum[i];
-#endif

        // Now calculate Born radius and OBC term.

@@ -45,24 +34,14 @@ KERNEL void reduceBornSum(float alpha, float beta, float gamma,
 */

 KERNEL void reduceBornForce(
-#ifdef SUPPORTS_64_BIT_ATOMICS
            GLOBAL mm_long* RESTRICT bornForce,
-#else
-            GLOBAL real* bornForce, int bufferSize, int numBuffers,
-#endif
            GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
    mixed energy = 0;
    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Get summed Born force

-#ifdef SUPPORTS_64_BIT_ATOMICS
        real force = RECIP((real) 0x100000000)*bornForce[index];
-#else
-        real force = bornForce[index];
-        int totalSize = bufferSize*numBuffers;
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            force += bornForce[i];
-#endif
+
        // Now calculate the actual force

        float offsetRadius = params[index].x;
@@ -73,11 +52,7 @@ KERNEL void reduceBornForce(
        force += saTerm/bornRadius;
        energy += saTerm;
        force *= bornRadius*bornRadius*obcChain[index];
-#ifdef SUPPORTS_64_BIT_ATOMICS
        bornForce[index] = realToFixedPoint(force);
-#else
-        bornForce[index] = force;
-#endif
    }
    energyBuffer[GLOBAL_ID] += energy/-6;
 }
--- a/platforms/common/src/kernels/gbsaObc_cpu.cc
+++ b/platforms/common/src/kernels/gbsaObc_cpu.cc
@@ -9,11 +9,7 @@ typedef struct {
 * Compute the Born sum.
 */
 KERNEL void computeBornSum(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT global_bornSum,
-#else
-        GLOBAL real* RESTRICT global_bornSum,
-#endif
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -87,12 +83,7 @@ KERNEL void computeBornSum(

                // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += bornSum;
-#endif
            }
        }
        else {
@@ -149,24 +140,14 @@ KERNEL void computeBornSum(

               // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += bornSum;
-#endif
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
                ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum));
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                global_bornSum[offset] += localData[tgx].bornSum;
-#endif
            }
        }
    }
@@ -296,12 +277,7 @@ KERNEL void computeBornSum(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += bornSum;
-#endif
                }
            }
            else
@@ -359,12 +335,7 @@ KERNEL void computeBornSum(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += bornSum;
-#endif
                }
            }

@@ -377,12 +348,7 @@ KERNEL void computeBornSum(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum));
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    global_bornSum[offset] += localData[tgx].bornSum;
-#endif
                }
            }
        }
@@ -402,11 +368,7 @@ typedef struct {
 */

 KERNEL void computeGBSAForce1(
-#ifdef SUPPORTS_64_BIT_ATOMICS
        GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
-#else
-        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
-#endif
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
@@ -490,16 +452,10 @@ KERNEL void computeGBSAForce1(

                // Write results.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                global_bornForce[offset] += force.w;
-#endif
            }
        }
        else {
@@ -561,36 +517,20 @@ KERNEL void computeGBSAForce1(

               // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                global_bornForce[offset] += force.w;
-#endif
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
                ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw));
-#else
-                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
-                real4 f = forceBuffers[offset];
-                f.x += localData[tgx].fx;
-                f.y += localData[tgx].fy;
-                f.z += localData[tgx].fz;
-                forceBuffers[offset] = f;
-                global_bornForce[offset] += localData[tgx].fw;
-#endif
            }
        }
    }
@@ -722,16 +662,10 @@ KERNEL void computeGBSAForce1(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                    global_bornForce[offset] += force.w;
-#endif
                }
            }
            else
@@ -790,16 +724,10 @@ KERNEL void computeGBSAForce1(

                    // Write results for atom1.

-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
-#else
-                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
-                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
-                    global_bornForce[offset] += force.w;
-#endif
                }
            }

@@ -812,20 +740,10 @@ KERNEL void computeGBSAForce1(
                unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
-#ifdef SUPPORTS_64_BIT_ATOMICS
                    ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                    ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw));
-#else
-                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
-                    real4 f = forceBuffers[offset];
-                    f.x += localData[tgx].fx;
-                    f.y += localData[tgx].fy;
-                    f.z += localData[tgx].fz;
-                    forceBuffers[offset] = f;
-                    global_bornForce[offset] += localData[tgx].fw;
-#endif
                }
            }
        }

--- a/platforms/common/src/kernels/pme.cc
+++ b/platforms/common/src/kernels/pme.cc
 KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
-#ifndef SUPPORTS_64_BIT_ATOMICS
-        , GLOBAL real4* RESTRICT pmeBsplineTheta, LOCAL real4* RESTRICT bsplinesCache,
-#ifdef CHARGE_FROM_SIGEPS
-        GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        GLOBAL const real* RESTRICT charges
-#endif
-#endif
    ) {
    // Compute the index of the grid point each atom is associated with.

@@ -25,42 +17,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
-#ifndef SUPPORTS_64_BIT_ATOMICS
-        // Compute B-splines here for use in the charge spreading kernel.
-        const real4 scale = 1/(real) (PME_ORDER-1);
-        LOCAL real4* data = &bsplinesCache[LOCAL_ID*PME_ORDER];
-        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-        data[PME_ORDER-1] = 0.0f;
-        data[1] = dr;
-        data[0] = 1.0f-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            real div = RECIP(j-1.0f);
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real4(k))*data[j-k-2] + (-dr+make_real4(j-k))*data[j-k-1]);
-            data[0] = div*(- dr+1.0f)*data[0];
-        }
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+make_real4(j))*data[PME_ORDER-j-2] + (-dr+make_real4(PME_ORDER-j))*data[PME_ORDER-j-1]);
-        data[0] = scale*(-dr+1.0f)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) {
-#ifdef CHARGE_FROM_SIGEPS
-            const float2 sigEps = sigmaEpsilon[atom];
-            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-            const real charge = CHARGE;
-#endif
-            data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
-            pmeBsplineTheta[atom+j*NUM_ATOMS] = data[j];
-        }
-#endif
    }
 }

-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
 #if defined(USE_HIP) && !defined(AMD_RDNA)
 LAUNCH_BOUNDS_EXACT(128, 1)
 #endif
@@ -206,197 +165,6 @@ KERNEL void finishSpreadCharge(
 #endif
    }
 }
-#elif defined(DEVICE_IS_CPU)
-KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ,
-#ifdef CHARGE_FROM_SIGEPS
-        GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        GLOBAL const real* RESTRICT charges
-#endif
-    ) {
-    const int firstx = GLOBAL_ID*GRID_SIZE_X/GLOBAL_SIZE;
-    const int lastx = (GLOBAL_ID+1)*GRID_SIZE_X/GLOBAL_SIZE;
-    if (firstx == lastx)
-        return;
-    const real4 scale = 1/(real) (PME_ORDER-1);
-    real4 data[PME_ORDER];
-    
-    // Process the atoms in spatially sorted order.  This improves efficiency when writing
-    // the grid values.
-    
-    for (int i = 0; i < NUM_ATOMS; i++) {
-        int atom = i;
-        real4 pos = posq[atom];
-        APPLY_PERIODIC_TO_POS(pos)
-        real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
-                           pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
-                           pos.z*recipBoxVecZ.z);
-        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
-        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
-        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
-        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z, 0);
-
-        // Spread the charge from this atom onto each grid point.
-
-#ifdef CHARGE_FROM_SIGEPS
-        const float2 sigEps = sigmaEpsilon[atom];
-        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-        const real charge = (CHARGE)*EPSILON_FACTOR;
-#endif
-        if (charge == 0)
-            continue;
-        bool hasComputedThetas = false;
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-            if (xindex < firstx || xindex >= lastx)
-                continue;
-            if (!hasComputedThetas) {
-                hasComputedThetas = true;
-                
-                // Since we need the full set of thetas, it's faster to compute them here than load them
-                // from global memory.
-
-                real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-                data[PME_ORDER-1] = 0.0f;
-                data[1] = dr;
-                data[0] = 1.0f-dr;
-                for (int j = 3; j < PME_ORDER; j++) {
-                    real div = RECIP(j-1.0f);
-                    data[j-1] = div*dr*data[j-2];
-                    for (int k = 1; k < (j-1); k++)
-                        data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
-                    data[0] = div*(- dr+1.0f)*data[0];
-                }
-                data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-                for (int j = 1; j < (PME_ORDER-1); j++)
-                    data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
-                data[0] = scale*(-dr+1.0f)*data[0];
-            }
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
-                }
-            }
-        }
-    }
-}
-#else
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-KERNEL void findAtomRangeForGrid(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL int* RESTRICT pmeAtomRange, GLOBAL const real4* RESTRICT posq) {
-    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
-    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-
-    // Fill in values beyond the last atom.
-
-    if (GLOBAL_ID == GLOBAL_SIZE-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
-
-/**
- * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
- * some work in the charge spreading kernel.
- */
-KERNEL void recordZIndex(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL const real4* RESTRICT posq, real4 periodicBoxSize, real4 recipBoxVecZ) {
-    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
-    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
-    for (int i = start; i < end; ++i) {
-        real posz = posq[pmeAtomGridIndex[i].x].z;
-        posz -= floor(posz*recipBoxVecZ.z)*periodicBoxSize.z;
-        int z = ((int) ((posz*recipBoxVecZ.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
-        pmeAtomGridIndex[i].y = z;
-    }
-}
-
-KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
-        GLOBAL const int2* RESTRICT pmeAtomGridIndex, GLOBAL const int* RESTRICT pmeAtomRange,
-        GLOBAL const real4* RESTRICT pmeBsplineTheta
-#ifdef CHARGE_FROM_SIGEPS
-        , GLOBAL const float2* RESTRICT sigmaEpsilon
-#else
-        , GLOBAL const real* RESTRICT charges
-#endif
-    ) {
-    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    for (int gridIndex = GLOBAL_ID; gridIndex < numGridPoints; gridIndex += GLOBAL_SIZE) {
-        // Compute the charge on a grid point.
-
-        int4 gridPoint;
-        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
-        gridPoint.y = remainder/GRID_SIZE_Z;
-        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        real result = 0.0f;
-
-        // Loop over all atoms that affect this grid point.
-
-        for (int ix = 0; ix < PME_ORDER; ++ix) {
-            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
-            for (int iy = 0; iy < PME_ORDER; ++iy) {
-                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
-                int z1 = gridPoint.z-PME_ORDER+1;
-                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
-                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
-                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
-                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
-                int firstAtom = pmeAtomRange[gridIndex1];
-                int lastAtom = pmeAtomRange[gridIndex2+1];
-                for (int i = firstAtom; i < lastAtom; ++i)
-                {
-                    int2 atomData = pmeAtomGridIndex[i];
-                    int atomIndex = atomData.x;
-                    int z = atomData.y;
-                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
-                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
-                }
-                if (z1 > gridPoint.z)
-                {
-                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
-                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
-                    firstAtom = pmeAtomRange[gridIndex1];
-                    lastAtom = pmeAtomRange[gridIndex2+1];
-                    for (int i = firstAtom; i < lastAtom; ++i)
-                    {
-                        int2 atomData = pmeAtomGridIndex[i];
-                        int atomIndex = atomData.x;
-                        int z = atomData.y;
-                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
-                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
-                    }
-                }
-            }
-        }
-        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
-    }
-}
-#endif

 KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
        GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,

--- a/platforms/opencl/include/OpenCLBondedUtilities.h
+++ b/platforms/opencl/include/OpenCLBondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -133,12 +133,6 @@ public:
     * Initialize this object in preparation for a simulation.
     */
    void initialize(const System& system);
-    /**
-     * Get the number of force buffers required for bonded forces.
-     */
-    int getNumForceBuffers() {
-        return numForceBuffers;
-    }
    /**
     * Compute the bonded interactions.
     * 
@@ -148,19 +142,17 @@ public:
 private:
    std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
    OpenCLContext& context;
-    std::vector<cl::Kernel> kernels;
+    cl::Kernel kernel;
    std::vector<std::vector<std::vector<int> > > forceAtoms;
    std::vector<int> indexWidth;
    std::vector<std::string> forceSource;
    std::vector<int> forceGroup;
-    std::vector<std::vector<int> > forceSets;
    std::vector<cl::Memory*> arguments;
    std::vector<std::string> argTypes;
    std::vector<OpenCLArray> atomIndices;
-    std::vector<OpenCLArray> bufferIndices;
    std::vector<std::string> prefixCode;
    std::vector<std::string> energyParameterDerivatives;
-    int numForceBuffers, maxBonds, allGroups;
+    int maxBonds, allGroups;
    bool hasInitializedKernels;
 };


--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -125,7 +125,7 @@ public:
     * Get the number of force buffers required for nonbonded forces.
     */
    int getNumForceBuffers() const {
-        return numForceBuffers;
+        return 1;
    }
    /**
     * Get the number of energy buffers required for nonbonded forces.
@@ -331,7 +331,7 @@ private:
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
-    int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
+    int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
    int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
    unsigned int tilesAfterReorder;
    long long numTiles;

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -34,7 +34,7 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
+OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
 }

 void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
@@ -85,11 +85,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
    if (numForces == 0)
        return;
    
-    // Build the lists of atom indices and buffer indices.
+    // Build the lists of atom indices.
    
-    vector<vector<cl_uint> > bufferVec(numForces);
-    vector<vector<int> > bufferCounter(numForces, vector<int>(system.getNumParticles(), 0));
-    vector<int> numBuffers(numForces, 0);
    atomIndices.resize(numForces);
    for (int i = 0; i < numForces; i++) {
        int numBonds = forceAtoms[i].size();
@@ -102,126 +99,40 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        }
        atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices");
        atomIndices[i].upload(indexVec);
-        bufferVec[i].resize(width*numBonds, 0);
-        for (int bond = 0; bond < numBonds; bond++) {
-            for (int atom = 0; atom < numAtoms; atom++)
-                bufferVec[i][bond*width+atom] = bufferCounter[i][forceAtoms[i][bond][atom]]++;
-        }
-        for (int j = 0; j < (int) bufferCounter[i].size(); j++)
-            numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
-    }
-    
-    // For efficiency, we want to merge multiple forces into a single kernel - but only if that
-    // won't increase the number of force buffers.
-    
-    if (context.getSupports64BitGlobalAtomics()) {
-        // Put all the forces in the same set.
-        
-        numForceBuffers = 1;
-        forceSets.push_back(vector<int>());
-        for (int i = 0; i < numForces; i++)
-            forceSets[0].push_back(i);
-    }
-    else {
-        // Figure out how many force buffers will be required.
-    
-        for (int i = 0; i < numForces; i++)
-            numForceBuffers = max(numForceBuffers, numBuffers[i]);
-        int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
-        if (context.getNonbondedUtilities().getHasInteractions())
-            bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
-        
-        // Figure out sets of forces that can be merged.
-        
-        vector<int> unmerged(numForces);
-        for (int i = 0; i < numForces; i++)
-            unmerged[i] = i;
-        for (int i = 0; i < numForces; i++)
-            for (int j = i-1; j >= 0; j--) {
-                if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
-                    break;
-                int temp = unmerged[j+1];
-                unmerged[j+1] = unmerged[j];
-                unmerged[j] = temp;
-            }
-        while (unmerged.size() > 0) {
-            int sum = numBuffers[unmerged.back()];
-            int i;
-            for (i = 0; i < (int) unmerged.size()-1; i++) {
-                if (sum+numBuffers[unmerged[i]] > bufferLimit)
-                    break;
-                sum += numBuffers[unmerged[i]];
-            }
-            forceSets.push_back(vector<int>());
-            for (int j = 0; j < i; j++)
-                forceSets.back().push_back(unmerged[j]);
-            forceSets.back().push_back(unmerged.back());
-            for (int j = 0; j < i; j++)
-                unmerged.erase(unmerged.begin());
-            unmerged.pop_back();
-        }
    }

-    // Update the buffer indices based on merged sets.
-    
-    bufferIndices.resize(numForces);
-    for (int i = 0; i < (int) forceSets.size(); i++)
-        for (int j = 0; j < (int) forceSets[i].size(); j++) {
-            int force = forceSets[i][j];
-            int numBonds = forceAtoms[force].size();
-            int numAtoms = forceAtoms[force][0].size();
-            int width = indexWidth[force];
-            for (int k = 0; k < j; k++)
-                for (int bond = 0; bond < numBonds; bond++)
-                    for (int atom = 0; atom < numAtoms; atom++)
-                        bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
-            bufferIndices[force].initialize<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
-            bufferIndices[force].upload(bufferVec[force]);
-        }
-
-    // Create the kernels.
+    // Create the kernel.

-    for (auto& set : forceSets) {
-        int setSize = set.size();
-        stringstream s;
-        s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
-        s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-        s<<"#endif\n";
-        for (int i = 0; i < (int) prefixCode.size(); i++)
-            s<<prefixCode[i];
-        string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
-        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
-        for (int i = 0; i < setSize; i++) {
-            int force = set[i];
-            string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
-            s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
-            s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
-        }
-        for (int i = 0; i < (int) arguments.size(); i++)
-            s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
-        if (energyParameterDerivatives.size() > 0)
-            s<<", __global mixed* restrict energyParamDerivs";
-        s<<") {\n";
-        s<<"mixed energy = 0;\n";
-        for (int i = 0; i < energyParameterDerivatives.size(); i++)
-            s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
-        for (int i = 0; i < setSize; i++) {
-            int force = set[i];
-            s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
-        }
-        s<<"energyBuffer[get_global_id(0)] += energy;\n";
-        const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
-        int numDerivs = allParamDerivNames.size();
-        for (int i = 0; i < energyParameterDerivatives.size(); i++)
-            for (int index = 0; index < numDerivs; index++)
-                if (allParamDerivNames[index] == energyParameterDerivatives[i])
-                    s<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
-        s<<"}\n";
-        map<string, string> defines;
-        defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
-        cl::Program program = context.createProgram(s.str(), defines);
-        kernels.push_back(cl::Kernel(program, "computeBondedForces"));
+    stringstream s;
+    for (int i = 0; i < (int) prefixCode.size(); i++)
+        s<<prefixCode[i];
+    s<<"__kernel void computeBondedForces(__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+    for (int force = 0; force < numForces; force++) {
+        string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
+        s<<", __global const "<<indexType<<"* restrict atomIndices"<<force;
    }
+    for (int i = 0; i < (int) arguments.size(); i++)
+        s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
+    if (energyParameterDerivatives.size() > 0)
+        s<<", __global mixed* restrict energyParamDerivs";
+    s<<") {\n";
+    s<<"mixed energy = 0;\n";
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
+    for (int force = 0; force < numForces; force++)
+        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
+    s<<"energyBuffer[get_global_id(0)] += energy;\n";
+    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = allParamDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        for (int index = 0; index < numDerivs; index++)
+            if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                s<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+    s<<"}\n";
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    cl::Program program = context.createProgram(s.str(), defines);
+    kernel = cl::Kernel(program, "computeBondedForces");
    forceAtoms.clear();
    forceSource.clear();
 }
@@ -247,7 +158,6 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
    s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
    s<<"    "<<indexType<<" atoms = atomIndices"<<forceIndex<<"[index];\n";
-    s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
        s<<"    real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
@@ -255,17 +165,9 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    {\n";
-        if (context.getSupports64BitGlobalAtomics()) {
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"], realToFixedPoint(force"<<(i+1)<<".x));\n";
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".y));\n";
-            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".z));\n";
-        }
-        else {
-            s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
-            s<<"    real4 force = forceBuffers[offset];\n";
-            s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
-            s<<"    forceBuffers[offset] = force;\n";
-        }
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".x));\n";
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".y));\n";
+        s<<"    ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".z));\n";
        s<<"    }\n";
    }
    s<<"}\n";
@@ -277,43 +179,32 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
        return;
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
-        for (int i = 0; i < (int) forceSets.size(); i++) {
-            int index = 0;
-            cl::Kernel& kernel = kernels[i];
-            if (context.getSupports64BitGlobalAtomics())
-                kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
-            else
-                kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
-            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
-            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
-            index += 6;
-            for (int j = 0; j < (int) forceSets[i].size(); j++) {
-                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]].getDeviceBuffer());
-                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]].getDeviceBuffer());
-            }
-            for (int j = 0; j < (int) arguments.size(); j++)
-                kernel.setArg<cl::Memory>(index++, *arguments[j]);
-            if (energyParameterDerivatives.size() > 0)
-                kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
-        }
+        int index = 0;
+        kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+        index += 6;
+        for (int j = 0; j < (int) atomIndices.size(); j++)
+            kernel.setArg<cl::Buffer>(index++, atomIndices[j].getDeviceBuffer());
+        for (int j = 0; j < (int) arguments.size(); j++)
+            kernel.setArg<cl::Memory>(index++, *arguments[j]);
+        if (energyParameterDerivatives.size() > 0)
+            kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
    }
-    for (int i = 0; i < (int) kernels.size(); i++) {
-        cl::Kernel& kernel = kernels[i];
-        kernel.setArg<cl_int>(3, groups);
-        if (context.getUseDoublePrecision()) {
-            kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble());
-            kernel.setArg<mm_double4>(5, context.getInvPeriodicBoxSizeDouble());
-            kernel.setArg<mm_double4>(6, context.getPeriodicBoxVecXDouble());
-            kernel.setArg<mm_double4>(7, context.getPeriodicBoxVecYDouble());
-            kernel.setArg<mm_double4>(8, context.getPeriodicBoxVecZDouble());
-        }
-        else {
-            kernel.setArg<mm_float4>(4, context.getPeriodicBoxSize());
-            kernel.setArg<mm_float4>(5, context.getInvPeriodicBoxSize());
-            kernel.setArg<mm_float4>(6, context.getPeriodicBoxVecX());
-            kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY());
-            kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ());
-        }
-        context.executeKernel(kernels[i], maxBonds);
+    kernel.setArg<cl_int>(3, groups);
+    if (context.getUseDoublePrecision()) {
+        kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble());
+        kernel.setArg<mm_double4>(5, context.getInvPeriodicBoxSizeDouble());
+        kernel.setArg<mm_double4>(6, context.getPeriodicBoxVecXDouble());
+        kernel.setArg<mm_double4>(7, context.getPeriodicBoxVecYDouble());
+        kernel.setArg<mm_double4>(8, context.getPeriodicBoxVecZDouble());
+    }
+    else {
+        kernel.setArg<mm_float4>(4, context.getPeriodicBoxSize());
+        kernel.setArg<mm_float4>(5, context.getInvPeriodicBoxSize());
+        kernel.setArg<mm_float4>(6, context.getPeriodicBoxVecX());
+        kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY());
+        kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ());
    }
+    context.executeKernel(kernel, maxBonds);
 }