Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -61,7 +61,7 @@ using namespace OpenMM;
 using namespace std;
 const int CudaContext::ThreadBlockSize = 64;
-const int CudaContext::TileSize = 32;
+const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
+    src << "typedef unsigned int tileflags;\n";
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -42,6 +42,8 @@
 #include "windowsExportCuda.h"
 #include "CudaPlatform.h"
+typedef unsigned int tileflags;
 namespace OpenMM {
 class CudaArray;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
-        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
        vector<int2> atomsVec(ccmaAtoms->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete ccmaDelta1;
    if (ccmaDelta2 != NULL)
        delete ccmaDelta2;
-    if (ccmaConvergedMemory != NULL)
+    if (ccmaConverged != NULL)
-        cuMemFreeHost(ccmaConvergedMemory);
+        delete ccmaConverged;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
    }
    if (ccmaAtoms != NULL) {
-        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
+        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
        int i;
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory,
+                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
                tolPointer, &i};
        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i};
+                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConvergedDeviceMemory, &i};
+                &ccmaConverged->getDevicePointer(), &i};
        const int checkInterval = 4;
+        int* converged = (int*) context.getPinnedBuffer();
        for (i = 0; i < 150; i++) {
-            if (i == 0) {
-                ccmaConvergedMemory[0] = 1;
-                ccmaConvergedMemory[1] = 0;
-            }
            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0)
+            if ((i+1)%checkInterval == 0) {
+                ccmaConverged->download(converged, false);
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+            }
            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
-                if (ccmaConvergedMemory[i%2])
+                if (converged[i%2])
                    break;
            }
        }

--- a/platforms/cuda/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda/src/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -140,8 +140,7 @@ private:
    CudaArray* ccmaConstraintMatrixValue;
    CudaArray* ccmaDelta1;
    CudaArray* ccmaDelta2;
-    int* ccmaConvergedMemory;
+    CudaArray* ccmaConverged;
-    CUdeviceptr ccmaConvergedDeviceMemory;
    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeBsplineModuliY;
    if (pmeBsplineModuliZ != NULL)
        delete pmeBsplineModuliZ;
-    if (pmeBsplineTheta != NULL)
-        delete pmeBsplineTheta;
-    if (pmeBsplineDTheta != NULL)
-        delete pmeBsplineDTheta;
    if (pmeAtomRange != NULL)
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        if (cu.getUseDoublePrecision())
            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
-        pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
+        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
-        pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
        pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
        pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
        pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+        cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
        cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
        // Create required data structures.
@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-        pmeBsplineTheta = new CudaArray(cu, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
        pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
    }
    if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
-        void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
+        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
+        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
-        int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-        cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
        sort->sort(*pmeAtomGridIndex);
-        void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+        void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
-                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
+                cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
-        cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
-        void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-        cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
-        void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
        if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
            void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
-                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
+                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
-        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
    }
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
        defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
+        defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
+        int numExclusionTiles = nb.getExclusionTiles().getSize();
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
+        int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
+        defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
+        defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
        map<string, string> replacements;
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
        computeBornSumKernel = cu.getKernel(module, "computeBornSum");
@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
            computeSumArgs.push_back(cu.getPeriodicBoxSizePointer());
            computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer());
            computeSumArgs.push_back(&maxTiles);
-            computeSumArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
+            computeSumArgs.push_back(&nb.getBlockCenters().getDevicePointer());
+            computeSumArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
        }
        else
            computeSumArgs.push_back(&maxTiles);
-        computeSumArgs.push_back(&nb.getExclusionIndices().getDevicePointer());
+        computeSumArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
-        computeSumArgs.push_back(&nb.getExclusionRowIndices().getDevicePointer());
        force1Kernel = cu.getKernel(module, "computeGBSAForce1");
        force1Args.push_back(&cu.getForce().getDevicePointer());
        force1Args.push_back(&bornForce->getDevicePointer());
@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
            force1Args.push_back(cu.getPeriodicBoxSizePointer());
            force1Args.push_back(cu.getInvPeriodicBoxSizePointer());
            force1Args.push_back(&maxTiles);
-            force1Args.push_back(&nb.getInteractionFlags().getDevicePointer());
+            force1Args.push_back(&nb.getBlockCenters().getDevicePointer());
+            force1Args.push_back(&nb.getInteractingAtoms().getDevicePointer());
        }
        else
            force1Args.push_back(&maxTiles);
-        force1Args.push_back(&nb.getExclusionIndices().getDevicePointer());
+        force1Args.push_back(&nb.getExclusionTiles().getDevicePointer());
-        force1Args.push_back(&nb.getExclusionRowIndices().getDevicePointer());
        reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
        reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
    }
@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
            maxTiles = nb.getInteractingTiles().getSize();
            computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer();
            force1Args[5] = &nb.getInteractingTiles().getDevicePointer();
-            computeSumArgs[8] = &nb.getInteractionFlags().getDevicePointer();
+            computeSumArgs[9] = &nb.getInteractingAtoms().getDevicePointer();
-            force1Args[10] = &nb.getInteractionFlags().getDevicePointer();
+            force1Args[11] = &nb.getInteractingAtoms().getDevicePointer();
        }
    }
    cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
    // Record parameters and exclusions.
    int numParticles = force.getNumParticles();
-    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
+    int paddedNumParticles = cu.getPaddedNumAtoms();
-    computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), numParticles, "customGBComputedValues", true, cu.getUseDoublePrecision());
+    int numParams = force.getNumPerParticleParameters();
+    params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), paddedNumParticles, "customGBParameters", true);
+    computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), paddedNumParticles, "customGBComputedValues", true, cu.getUseDoublePrecision());
    if (force.getNumGlobalParameters() > 0)
        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customGBGlobals");
-    vector<vector<float> > paramVector(numParticles);
+    vector<vector<float> > paramVector(paddedNumParticles, vector<float>(numParams, 0));
    vector<vector<int> > exclusionList(numParticles);
    for (int i = 0; i < numParticles; i++) {
        vector<double> parameters;
        force.getParticleParameters(i, parameters);
-        paramVector[i].resize(parameters.size());
        for (int j = 0; j < (int) parameters.size(); j++)
            paramVector[i][j] = (float) parameters[j];
        exclusionList[i].push_back(i);
@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
        replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
        replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
-        map<string, string> defines;
        if (useCutoff)
-            defines["USE_CUTOFF"] = "1";
+            pairValueDefines["USE_CUTOFF"] = "1";
        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
+            pairValueDefines["USE_PERIODIC"] = "1";
        if (useExclusionsForValue)
-            defines["USE_EXCLUSIONS"] = "1";
+            pairValueDefines["USE_EXCLUSIONS"] = "1";
        if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
-            defines["NEED_PADDING"] = "1";
+            pairValueDefines["NEED_PADDING"] = "1";
-        defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
+        pairValueDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
-        defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+        pairValueDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
-        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairValueDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        pairValueDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+        pairValueDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+        pairValueDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
-        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements), defines);
+        pairValueDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
-        pairValueKernel = cu.getKernel(module, "computeN2Value");
+        pairValueSrc = cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements);
        if (useExclusionsForValue)
            cu.getNonbondedUtilities().requestExclusions(exclusionList);
    }
@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
        replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
-        map<string, string> defines;
        if (useCutoff)
-            defines["USE_CUTOFF"] = "1";
+            pairEnergyDefines["USE_CUTOFF"] = "1";
        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
+            pairEnergyDefines["USE_PERIODIC"] = "1";
        if (anyExclusions)
-            defines["USE_EXCLUSIONS"] = "1";
+            pairEnergyDefines["USE_EXCLUSIONS"] = "1";
        if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
-            defines["NEED_PADDING"] = "1";
+            pairEnergyDefines["NEED_PADDING"] = "1";
-        defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+        pairEnergyDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
-        defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
+        pairEnergyDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
-        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairEnergyDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
-        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        pairEnergyDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+        pairEnergyDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-        defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+        pairEnergyDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
-        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements), defines);
+        pairEnergyDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
-        pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
+        pairEnergySrc = cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements);
    }
    {
        // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
+        // These two kernels can't be compiled in initialize(), because the nonbonded utilities object
+        // has not yet been initialized then.
+        {
+            int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
+            pairValueDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
+            int numContexts = cu.getPlatformData().contexts.size();
+            int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
+            int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
+            pairValueDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
+            pairValueDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairValueSrc, pairValueDefines);
+            pairValueKernel = cu.getKernel(module, "computeN2Value");
+            pairValueSrc = "";
+            pairValueDefines.clear();
+        }
+        {
+            int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
+            pairEnergyDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
+            int numContexts = cu.getPlatformData().contexts.size();
+            int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
+            int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
+            pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
+            pairEnergyDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairEnergySrc, pairEnergyDefines);
+            pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
+            pairEnergySrc = "";
+            pairEnergyDefines.clear();
+        }
+        // Set arguments for kernels.
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
        valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
        cu.addAutoclearBuffer(*valueBuffers);
        cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
        pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
-        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
+        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
-        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
        pairValueArgs.push_back(&valueBuffers->getDevicePointer());
        if (nb.getUseCutoff()) {
            pairValueArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
            pairValueArgs.push_back(cu.getPeriodicBoxSizePointer());
            pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer());
            pairValueArgs.push_back(&maxTiles);
-            pairValueArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
+            pairValueArgs.push_back(&nb.getBlockCenters().getDevicePointer());
+            pairValueArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
        }
        else
            pairValueArgs.push_back(&maxTiles);
@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
        pairEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
        pairEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
-        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
+        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
-        pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
        if (nb.getUseCutoff()) {
            pairEnergyArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
            pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer());
            pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer());
            pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer());
            pairEnergyArgs.push_back(&maxTiles);
-            pairEnergyArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
+            pairEnergyArgs.push_back(&nb.getBlockCenters().getDevicePointer());
+            pairEnergyArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
        }
        else
            pairEnergyArgs.push_back(&maxTiles);
@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
    if (nb.getUseCutoff()) {
        if (maxTiles < nb.getInteractingTiles().getSize()) {
            maxTiles = nb.getInteractingTiles().getSize();
-            pairValueArgs[5] = &nb.getInteractingTiles().getDevicePointer();
+            pairValueArgs[4] = &nb.getInteractingTiles().getDevicePointer();
-            pairEnergyArgs[6] = &nb.getInteractingTiles().getDevicePointer();
+            pairEnergyArgs[5] = &nb.getInteractingTiles().getDevicePointer();
-            pairValueArgs[10] = &nb.getInteractionFlags().getDevicePointer();
+            pairValueArgs[10] = &nb.getInteractingAtoms().getDevicePointer();
-            pairEnergyArgs[11] = &nb.getInteractionFlags().getDevicePointer();
+            pairEnergyArgs[11] = &nb.getInteractingAtoms().getDevicePointer();
        }
    }
    cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
    // Record the per-particle parameters.
-    vector<vector<float> > paramVector(numParticles);
+    vector<vector<float> > paramVector(cu.getPaddedNumAtoms(), vector<float>(force.getNumPerParticleParameters(), 0));
    vector<double> parameters;
    for (int i = 0; i < numParticles; i++) {
        force.getParticleParameters(i, parameters);
-        paramVector[i].resize(parameters.size());
        for (int j = 0; j < (int) parameters.size(); j++)
            paramVector[i][j] = (float) parameters[j];
    }

--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -607,8 +606,6 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
-    CudaArray* pmeBsplineTheta;
-    CudaArray* pmeBsplineDTheta;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
@@ -617,9 +614,6 @@ private:
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
-    CUfunction pmeAtomRangeKernel;
-    CUfunction pmeZIndexKernel;
-    CUfunction pmeUpdateBsplinesKernel;
    CUfunction pmeSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
@@ -776,6 +770,8 @@ private:
    System& system;
    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
+    std::string pairValueSrc, pairEnergySrc;
+    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
 };
 /**

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,6 +29,8 @@
 #include "CudaArray.h"
 #include "CudaKernelSources.h"
 #include "CudaExpressionUtilities.h"
+#include "CudaSort.h"
+#include <algorithm>
 #include <map>
 #include <set>
 #include <utility>
@@ -43,15 +45,33 @@ using namespace std;
        throw OpenMMException(m.str());\
    }
-CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
-        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
+class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) {
+public:
+    BlockSortTrait(bool useDouble) : useDouble(useDouble) {
+    }
+    int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
+    int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
+    const char* getDataType() const {return "real2";}
+    const char* getKeyType() const {return "real";}
+    const char* getMinKey() const {return "-3.40282e+38f";}
+    const char* getMaxKey() const {return "3.40282e+38f";}
+    const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
+    const char* getSortKey() const {return "value.x";}
+private:
+    bool useDouble;
+};
+CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
+        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
    // Decide how many thread blocks to use.
    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
-    numForceThreadBlocks = 3*multiprocessors;
+    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete exclusionIndices;
    if (exclusionRowIndices != NULL)
        delete exclusionRowIndices;
+    if (exclusionTiles != NULL)
+        delete exclusionTiles;
    if (exclusions != NULL)
        delete exclusions;
    if (interactingTiles != NULL)
        delete interactingTiles;
-    if (interactionFlags != NULL)
+    if (interactingAtoms != NULL)
-        delete interactionFlags;
+        delete interactingAtoms;
    if (interactionCount != NULL)
        delete interactionCount;
    if (blockCenter != NULL)
        delete blockCenter;
    if (blockBoundingBox != NULL)
        delete blockBoundingBox;
+    if (sortedBlocks != NULL)
+        delete sortedBlocks;
+    if (sortedBlockCenter != NULL)
+        delete sortedBlockCenter;
+    if (sortedBlockBoundingBox != NULL)
+        delete sortedBlockBoundingBox;
+    if (oldPositions != NULL)
+        delete oldPositions;
+    if (rebuildNeighborList != NULL)
+        delete rebuildNeighborList;
+    if (blockSorter != NULL)
+        delete blockSorter;
 }
 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
    }
 }
+static bool compareUshort2(ushort2 a, ushort2 b) {
+    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
+}
 void CudaNonbondedUtilities::initialize(const System& system) {
    string errorMessage = "Error initializing nonbonded utilities";    
    if (atomExclusions.size() == 0) {
@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    numAtoms = context.getNumAtoms();
    int numAtomBlocks = context.getNumAtomBlocks();
-    int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
    int numContexts = context.getPlatformData().contexts.size();
-    startTileIndex = context.getContextIndex()*totalTiles/numContexts;
+    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
-    int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
-    numTiles = endTileIndex-startTileIndex;
-    // Build a list of indices for the tiles with exclusions.
+    // Build a list of tiles that contain exclusions.
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
-    if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
+    vector<ushort2> exclusionTilesVec;
-        for (int i = 0; i < numAtomBlocks; ++i)
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
-            tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
+        exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
+    exclusionTiles = CudaArray::create<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
+    exclusionTiles->upload(exclusionTilesVec);
+    map<pair<int, int>, int> exclusionTileMap;
+    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
+        ushort2 tile = exclusionTilesVec[i];
+        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
+    }
+    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
+        exclusionBlocksForBlock[iter->first].push_back(iter->second);
+        if (iter->first != iter->second)
+            exclusionBlocksForBlock[iter->second].push_back(iter->first);
    }
    vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
    vector<unsigned int> exclusionIndicesVec;
-    int currentRow = 0;
+    for (int i = 0; i < numAtomBlocks; i++) {
-    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
+        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
-        while (iter->first != currentRow)
+        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
-            exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
-        exclusionIndicesVec.push_back(iter->second);
    }
-    exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
    exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
    exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    // Record the exclusion data.
-    exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
+    exclusions = CudaArray::create<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
-    vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF);
+    tileflags allFlags = (tileflags) -1;
+    vector<tileflags> exclusionVec(exclusions->getSize(), allFlags);
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            int y = atom2/CudaContext::TileSize;
            int offset2 = atom2-y*CudaContext::TileSize;
            if (x > y) {
-                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
+                int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
-                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
+                exclusionVec[index+offset1] &= allFlags-(1<<offset2);
            }
            else {
-                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
+                int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
-                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
+                exclusionVec[index+offset2] &= allFlags-(1<<offset1);
-            }
-        }
-    }
-    // Mark all interactions that involve a padding atom as being excluded.
-    for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
-        int x = atom1/CudaContext::TileSize;
-        int offset1 = atom1-x*CudaContext::TileSize;
-        for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
-            int y = atom2/CudaContext::TileSize;
-            int offset2 = atom2-y*CudaContext::TileSize;
-            if (x >= y) {
-                int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
-            }
-            if (y >= x) {
-                int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
-                exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
            }
        }
    }
@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    // Create data structures for the neighbor list.
    if (useCutoff) {
-        // Select a size for the arrays that hold the neighbor list.  This estimate is intentionally very
+        // Select a size for the arrays that hold the neighbor list.  We have to make a fairly
-        // high, because if it ever is too small, we have to fall back to the N^2 algorithm.
+        // arbitrary guess, but if this turns out to be too small we'll increase it later.
-        double4 boxSize = context.getPeriodicBoxSize();
+        maxTiles = 20*numAtomBlocks;
-        maxTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
        if (maxTiles > numTiles)
            maxTiles = numTiles;
        if (maxTiles < 1)
            maxTiles = 1;
        interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
-        interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
+        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
        interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
+        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+        blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        sortedBlocks = new CudaArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
+        sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
        if (context.getUseDoublePrecision()) {
-            blockCenter = CudaArray::create<double4>(context, numAtomBlocks, "blockCenter");
+            vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
-            blockBoundingBox = CudaArray::create<double4>(context, numAtomBlocks, "blockBoundingBox");
+            oldPositions->upload(oldPositionsVec);
        }
        else {
-            blockCenter = CudaArray::create<float4>(context, numAtomBlocks, "blockCenter");
+            vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
-            blockBoundingBox = CudaArray::create<float4>(context, numAtomBlocks, "blockBoundingBox");
+            oldPositions->upload(oldPositionsVec);
        }
+        rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
+        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<unsigned int> count(1, 0);
        interactionCount->upload(count);
    }
@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    if (kernelSource.size() > 0)
        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
    if (useCutoff) {
+        double padding = (usePadding ? 0.1*cutoff : 0.0);
+        double paddedCutoff = cutoff+padding;
        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-        defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
+        int maxExclusions = 0;
+        for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
+            maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
        findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
        findBlockBoundsArgs.push_back(&numAtoms);
@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
        findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
        findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
-        findBlockBoundsArgs.push_back(&interactionCount->getDevicePointer());
+        findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer());
+        findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer());
+        sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
+        sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer());
+        sortBoxDataArgs.push_back(&blockCenter->getDevicePointer());
+        sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockCenter->getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
+        sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
+        sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
+        sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
+        sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
        findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
-        findInteractingBlocksArgs.push_back(&interactionFlags->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
        findInteractingBlocksArgs.push_back(&maxTiles);
-        findInteractingBlocksArgs.push_back(&startTileIndex);
+        findInteractingBlocksArgs.push_back(&startBlockIndex);
-        findInteractingBlocksArgs.push_back(&numTiles);
+        findInteractingBlocksArgs.push_back(&numBlocks);
-        findInteractionsWithinBlocksKernel = context.getKernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
+        findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockCenter->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&context.getPosq().getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionIndices->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&interactingTiles->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionRowIndices->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&blockCenter->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&oldPositions->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&rebuildNeighborList->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&interactionFlags->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&interactionCount->getDevicePointer());
-        findInteractionsWithinBlocksArgs.push_back(&maxTiles);
    }
 }
-int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
-    if (x < y)
-        throw OpenMMException("Internal error: called findExclusionIndex with x<y");
-    int start = exclusionRowIndices[x];
-    int end = exclusionRowIndices[x+1];
-    for (int i = start; i < end; i++)
-        if (exclusionIndices[i] == y)
-            return i*CudaContext::TileSize;
-    throw OpenMMException("Internal error: exclusion in unexpected tile");
-}
 void CudaNonbondedUtilities::prepareInteractions() {
    if (!useCutoff)
        return;
@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
    // Compute the neighbor list.
    context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
-    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms());
+    blockSorter->sort(*sortedBlocks);
-    context.executeKernel(findInteractionsWithinBlocksKernel, &findInteractionsWithinBlocksArgs[0], context.getNumAtoms(), 128);
+    context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
+    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
 }
 void CudaNonbondedUtilities::computeInteractions() {
-    if (kernelSource.size() > 0)
+    if (kernelSource.size() > 0) {
        context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        if (context.getComputeForceCount() == 1)
+            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
+    }
 }
 void CudaNonbondedUtilities::updateNeighborListSize() {
@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
    // this from happening in the future.
    maxTiles = (int) (1.2*pinnedInteractionCount[0]);
-    int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
-    if (maxTiles > numTiles)
+    if (maxTiles > totalTiles)
-        maxTiles = numTiles;
+        maxTiles = totalTiles;
    delete interactingTiles;
+    delete interactingAtoms;
+    interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
+    interactingAtoms = NULL;
    interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
+    interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
    if (forceArgs.size() > 0)
-        forceArgs[8] = &interactingTiles->getDevicePointer();
+        forceArgs[7] = &interactingTiles->getDevicePointer();
    findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
-    delete interactionFlags;
-    interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
    if (forceArgs.size() > 0)
-        forceArgs[13] = &interactionFlags->getDevicePointer();
+        forceArgs[13] = &interactingAtoms->getDevicePointer();
-    findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer();
+    findInteractingBlocksArgs[6] = &interactingAtoms->getDevicePointer();
-    findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer();
+    if (context.getUseDoublePrecision()) {
-    findInteractionsWithinBlocksArgs[6] = &interactionFlags->getDevicePointer();
+        vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
+        oldPositions->upload(oldPositionsVec);
+    }
+    else {
+        vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
+        oldPositions->upload(oldPositionsVec);
+    }
 }
-void CudaNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
+void CudaNonbondedUtilities::setUsePadding(bool padding) {
-    this->startTileIndex = startTileIndex;
+    usePadding = padding;
-    this->numTiles = numTiles;
+}
+void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
+    int numAtomBlocks = context.getNumAtomBlocks();
+    startBlockIndex = (int) (startFraction*numAtomBlocks);
+    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
+    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+    startTileIndex = (int) (startFraction*totalTiles);;
+    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
 }
 CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+    defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
+    int numExclusionTiles = exclusionTiles->getSize();
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
+    int numContexts = context.getPlatformData().contexts.size();
+    int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
+    int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
+    defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
+    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
    forceArgs.push_back(&context.getPosq().getDevicePointer());
    forceArgs.push_back(&exclusions->getDevicePointer());
-    forceArgs.push_back(&exclusionIndices->getDevicePointer());
+    forceArgs.push_back(&exclusionTiles->getDevicePointer());
-    forceArgs.push_back(&exclusionRowIndices->getDevicePointer());
    forceArgs.push_back(&startTileIndex);
    forceArgs.push_back(&numTiles);
    if (useCutoff) {
@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        forceArgs.push_back(context.getPeriodicBoxSizePointer());
        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
        forceArgs.push_back(&maxTiles);
-        forceArgs.push_back(&interactionFlags->getDevicePointer());
+        forceArgs.push_back(&blockCenter->getDevicePointer());
+        forceArgs.push_back(&interactingAtoms->getDevicePointer());
    }
    for (int i = 0; i < (int) params.size(); i++)
        forceArgs.push_back(&params[i].getMemory());

--- a/platforms/cuda/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda/src/CudaNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -36,6 +36,8 @@
 namespace OpenMM {
+class CudaSort;
 /**
 * This class provides a generic interface for calculating nonbonded interactions.  It does this in two
 * ways.  First, it can be used to create kernels that evaluate nonbonded interactions.  Clients
@@ -181,10 +183,10 @@ public:
        return *interactingTiles;
    }
    /**
-     * Get the array containing flags for tiles with interactions.
+     * Get the array containing the atoms in each tile with interactions.
     */
-    CudaArray& getInteractionFlags() {
+    CudaArray& getInteractingAtoms() {
-        return *interactionFlags;
+        return *interactingAtoms;
    }
    /**
     * Get the array containing exclusion flags.
@@ -192,6 +194,12 @@ public:
    CudaArray& getExclusions() {
        return *exclusions;
    }
+    /**
+     * Get the array containing tiles with exclusions.
+     */
+    CudaArray& getExclusionTiles() {
+        return *exclusionTiles;
+    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
@@ -217,9 +225,17 @@ public:
        return numTiles;
    }
    /**
-     * Set the range of tiles that should be processed by this context.
+     * Set whether to add padding to the cutoff distance when building the neighbor list.
+     * This increases the size of the neighbor list (and thus the cost of computing interactions),
+     * but also means we don't need to rebuild it every time step.  The default value is true,
+     * since usually this improves performance.  For very expensive interactions, however,
+     * it may be better to set this to false.
+     */
+    void setUsePadding(bool padding);
+    /**
+     * Set the range of atom blocks and tiles that should be processed by this context.
     */
-    void setTileRange(int startTileIndex, int numTiles);
+    void setAtomBlockRange(double startFraction, double endFraction);
    /**
     * Create a Kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
     * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
@@ -232,42 +248,38 @@ public:
     * @param isSymmetric   specifies whether the interaction is symmetric
     */
    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-    /**
-     * This is a utility routine for locating data in the exclusions array.  It takes the (x,y) indices of a tile,
-     * and returns the location in the array where the data for that tile begins.
-     * 
-     * This routine requires that x >= y.  If not, it will throw an exception.
-     * 
-     * @param x                   the x index of the tile
-     * @param y                   the y index of the tile
-     * @param exclusionIndices    the content of the exclusionIndices array
-     * @param exclusionRowIndices the content of the exclusionRowIndices array
-     * @return the index in the exclusions array at which the data for that tile begins
-     */
-    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
 private:
+    class BlockSortTrait;
    CudaContext& context;
    CUfunction forceKernel;
    CUfunction findBlockBoundsKernel;
+    CUfunction sortBoxDataKernel;
    CUfunction findInteractingBlocksKernel;
    CUfunction findInteractionsWithinBlocksKernel;
+    CudaArray* exclusionTiles;
    CudaArray* exclusions;
    CudaArray* exclusionIndices;
    CudaArray* exclusionRowIndices;
    CudaArray* interactingTiles;
-    CudaArray* interactionFlags;
+    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
-    std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
+    CudaArray* sortedBlocks;
+    CudaArray* sortedBlockCenter;
+    CudaArray* sortedBlockBoundingBox;
+    CudaArray* oldPositions;
+    CudaArray* rebuildNeighborList;
+    CudaSort* blockSorter;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
    std::string kernelSource;
    std::map<std::string, std::string> kernelDefines;
    double cutoff;
-    bool useCutoff, usePeriodic, anyExclusions;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding;
-    int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
 };
 /**

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -118,7 +118,7 @@ private:
 };
 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    sumKernel = cu.getKernel(module, "sumForces");
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system);
+    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
+        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
 }
 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
        cu.executeKernel(sumKernel, args, bufferSize);
-        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        int firstIndex = 0, lastIndex = 0;
-        int totalTiles = 0;
        for (int i = 0; i < (int) completionTimes.size(); i++) {
            if (completionTimes[i] < completionTimes[firstIndex])
                firstIndex = i;
            if (completionTimes[i] > completionTimes[lastIndex])
                lastIndex = i;
-            contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
+        }
-            totalTiles += contextTiles[i];
+        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
-        }
+        contextNonbondedFractions[firstIndex] += fractionToTransfer;
-        int tilesToTransfer = totalTiles/1000;
+        contextNonbondedFractions[lastIndex] -= fractionToTransfer;
-        if (tilesToTransfer < 1)
+        double startFraction = 0.0;
-            tilesToTransfer = 1;
+        for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
-        if (tilesToTransfer > contextTiles[lastIndex])
+            double endFraction = startFraction+contextNonbondedFractions[i];
-            tilesToTransfer = contextTiles[lastIndex];
+            if (i == contextNonbondedFractions.size()-1)
-        contextTiles[firstIndex] += tilesToTransfer;
+                endFraction = 1.0; // Avoid roundoff error
-        contextTiles[lastIndex] -= tilesToTransfer;
+            data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
-        int startIndex = 0;
+            startFraction = endFraction;
-        for (int i = 0; i < (int) contextTiles.size(); i++) {
-            data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
-            startIndex += contextTiles[i];
        }
    }
    return energy;

--- a/platforms/cuda/src/CudaParallelKernels.h
+++ b/platforms/cuda/src/CudaParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ private:
    CudaPlatform::PlatformData& data;
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
-    std::vector<int> contextTiles;
+    std::vector<double> contextNonbondedFractions;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaSort.cpp
+++ b/platforms/cuda/src/CudaSort.cpp
@@ -32,7 +32,7 @@ using namespace OpenMM;
 using namespace std;
 CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
-        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.
    map<string, string> replacements;
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
+    shortListKernel = context.getKernel(module, "sortShortList");
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    int maxBlockSize;
    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
+    int maxSharedMem;
+    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
+    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    isShortList = (length <= maxLocalBuffer);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
-    sortKernelSize = rangeKernelSize/2;
+    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
-    int maxSharedMem;
-    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
-    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    // Create workspace arrays.
+    if (!isShortList) {
        dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
        offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
        buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
+    }
 }
 CudaSort::~CudaSort() {
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
 }
 void CudaSort::sort(CudaArray& data) {
-    if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
+    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;
+    if (isShortList) {
+        // We can use a simpler sort kernel that does the entire operation at once in local memory.
+        void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
+        context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
+    }
+    else {
        // Compute the range of data values.
-    unsigned int dataSize = data.getSize();
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
-    void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
        // Assign array elements to buckets.
        unsigned int numBuckets = bucketOffset->getSize();
        context.clearBuffer(*bucketOffset);
-    void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
+        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
                &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
        context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
        // Copy the data into the buckets.
-    void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
+        void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
                &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
        context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
        void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+    }
 }
--- a/platforms/cuda/src/CudaSort.h
+++ b/platforms/cuda/src/CudaSort.h
@@ -92,8 +92,9 @@ private:
    CudaArray* offsetInBucket;
    CudaArray* bucketOffset;
    CudaArray* buckets;
-    CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList;
 };
 /**

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
 #if USE_EWALD
-bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
 if (!isExcluded || needCorrection) {
-    real tempForce = 0.0f;
    if (r2 < CUTOFF_SQUARED || needCorrection) {
        const real alphaR = EWALD_ALPHA*r;
        const real expAlphaRSqr = EXP(-alphaR*alphaR);
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
        t *= t;
        t *= t;
        const real erfcAlphaR = RECIP(t*t);
+        real tempForce = 0.0f;
        if (needCorrection) {
            // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
            tempEnergy += prefactor*erfcAlphaR;
 #endif
        }
-    }
        dEdR += tempForce*invR*invR;
+    }
 }
 #else
 {

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
 #define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
 #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
-#define TILE_SIZE 32
 typedef struct {
    real4 posq;
@@ -15,88 +14,43 @@ typedef struct {
 * Compute a force based on pair interactions.
 */
 extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
+        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
-        const unsigned int* __restrict__ exclusionRowIndices,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    unsigned int numTiles = interactionCount[0];
+    const unsigned int tbx = threadIdx.x - tgx;
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-    do {
+    // First loop: process tiles that contain exclusions.
-        // Extract the coordinates of this tile
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int tbx = threadIdx.x - tgx;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        unsigned int x, y;
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
-            // Locate the exclusion data for this tile.
 #ifdef USE_EXCLUSIONS
-            if (tgx < 2)
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
 #endif
-            if (pos >= end)
+        if (x == y) {
-                ; // This warp is done.
-            else if (x == y) {
            // This tile is on the diagonal.
            const unsigned int localAtomIndex = threadIdx.x;
            localData[localAtomIndex].posq = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                int atom2 = tbx+j;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
            // This is an off-diagonal tile.
            const unsigned int localAtomIndex = threadIdx.x;
-                if (lasty != y) {
            unsigned int j = y*TILE_SIZE + tgx;
            localData[localAtomIndex].posq = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
            localData[localAtomIndex].force = make_real3(0);
            CLEAR_LOCAL_DERIVATIVES
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags == 0) {
-                    // No interactions in this tile.
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
 #ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
            unsigned int tj = tgx;
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+            for (j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
                int atom2 = tbx+tj;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
        }
-            }
-        }
-        lasty = y;
        // Write results.
-        if (pos < end) {
+        unsigned int offset = x*TILE_SIZE + tgx;
-            const unsigned int offset = x*TILE_SIZE + tgx;
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
        STORE_DERIVATIVES_1
-        }
+        if (x != y) {
-        if (pos < end && x != y) {
+            offset = y*TILE_SIZE + tgx;
-            const unsigned int offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
            STORE_DERIVATIVES_2
        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    while (pos < end) {
+        const bool isExcluded = false;
+        real3 force = make_real3(0);
+        DECLARE_ATOM1_DERIVATIVES
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                localData[localAtomIndex].posq = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].force = make_real3(0);
+                CLEAR_LOCAL_DERIVATIVES
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        atom2 = tbx+tj;
+                        localData[atom2].force.x += delta.x;
+                        localData[atom2].force.y += delta.y;
+                        localData[atom2].force.z += delta.z;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        atom2 = tbx+tj;
+                        localData[atom2].force.x += delta.x;
+                        localData[atom2].force.y += delta.y;
+                        localData[atom2].force.z += delta.z;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            // Write results.
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+            unsigned int offset = atom1;
+            STORE_DERIVATIVES_1
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+                offset = atom2;
+                STORE_DERIVATIVES_2
+            }
+        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
-#define TILE_SIZE 32
 typedef struct {
    real4 posq;
    real value, temp;
@@ -13,86 +11,41 @@ typedef struct {
 * Compute a value based on pair interactions.
 */
 extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
-        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
+        const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-    do {
-        // Extract the coordinates of this tile
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-        unsigned int x, y;
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real value = 0;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
-            // Locate the exclusion data for this tile.
 #ifdef USE_EXCLUSIONS
-            if (tgx < 2)
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
 #endif
-            if (pos >= end)
+        if (x == y) {
-                ; // This warp is done.
-            else if (x == y) {
            // This tile is on the diagonal.
            const unsigned int localAtomIndex = threadIdx.x;
            localData[localAtomIndex].posq = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                int atom2 = tbx+j;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    real tempValue1 = 0;
                    real tempValue2 = 0;
 #ifdef USE_EXCLUSIONS
-                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded && atom1 != atom2) {
 #else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
 #endif
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        else {
            // This is an off-diagonal tile.
-                if (lasty != y) {
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    localData[threadIdx.x].posq = posq[j];
            const unsigned int localAtomIndex = threadIdx.x;
+            unsigned int j = y*TILE_SIZE + tgx;
+            localData[localAtomIndex].posq = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
+            localData[localAtomIndex].value = 0;
-                localData[threadIdx.x].value = 0;
+#ifdef USE_EXCLUSIONS
-#ifdef USE_CUTOFF
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+#endif
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
+            unsigned int tj = tgx;
-                    if (flags == 0) {
+            for (j = 0; j < TILE_SIZE; j++) {
-                        // No interactions in this tile.
+                int atom2 = tbx+tj;
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                real tempValue1 = 0;
+#ifdef USE_CUTOFF
-                                real tempValue2 = 0;
                if (r2 < CUTOFF_SQUARED) {
+#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
-                                    atom2 = y*TILE_SIZE+j;
+                    atom2 = y*TILE_SIZE+tj;
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded) {
+#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
+                    localData[tbx+tj].value += tempValue2;
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
        }
-                                localData[threadIdx.x].temp = tempValue2;
-                                // Sum the forces on atom2.
+        // Write results.
-                                if (tgx % 4 == 0)
+        unsigned int offset = x*TILE_SIZE + tgx;
-                                    localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
+        atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
-                                if (tgx == 0)
+        if (x != y) {
-                                    localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
+            offset = y*TILE_SIZE + tgx;
-                            }
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
        }
    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    while (pos < end) {
+        real value = 0;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
        }
        else
 #endif
        {
-                    // Compute the full set of interactions in this tile.
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
-#ifdef USE_EXCLUSIONS
+            // Skip over tiles that have exclusions, since they were already processed.
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
 #endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                localData[localAtomIndex].posq = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].value = 0;
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
+                    int atom2 = tbx+tj;
-                        bool isExcluded = !(excl & 0x1);
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        localData[tbx+tj].value += tempValue2;
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
 #endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = localData[atom2].posq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+tj;
+                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
 #ifdef USE_CUTOFF
                    }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
-            }
-        }
            // Write results.
-        if (pos < end) {
+            atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
-            const unsigned int offset = x*TILE_SIZE + tgx;
+#ifdef USE_CUTOFF
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
+            unsigned int atom2 = atomIndices[threadIdx.x];
-        }
+#else
-        if (pos < end && x != y) {
+            unsigned int atom2 = y*TILE_SIZE + tgx;
-            const unsigned int offset = y*TILE_SIZE + tgx;
+#endif
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            if (atom2 < PADDED_NUM_ATOMS)
+                atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
        }
-        lasty = y;
        pos++;
-    } while (pos < end);
+    }
 }
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
        real3 crossProduct = cross(vec1, vec2);
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine);
    return angle;
 }

--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
        for (int atom = 0; atom < NUM_ATOMS; atom++) {
            real4 apos = posq[atom];
            real phase = apos.x*kx;
-            real2 structureFactor = make_real2(cos(phase), sin(phase));
+            real2 structureFactor = make_real2(COS(phase), SIN(phase));
            phase = apos.y*ky;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            phase = apos.z*kz;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            sum += apos.w*structureFactor;
        }
        cosSinSum[index] = sum;
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
            for (int ry = lowry; ry < KMAX_Y; ry++) {
                real ky = ry*reciprocalBoxSize.y;
                real phase = apos.x*kx;
-                real2 tab_xy = make_real2(cos(phase), sin(phase));
+                real2 tab_xy = make_real2(COS(phase), SIN(phase));
                phase = apos.y*ky;
-                tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                for (int rz = lowrz; rz < KMAX_Z; rz++) {
                    real kz = rz*reciprocalBoxSize.z;
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
                    real k2 = kx*kx + ky*ky + kz*kz;
                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
-                    real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                    real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                    real2 sum = cosSinSum[index];
                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
                    force.x += dEdR*kx;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
-#define TILE_SIZE 32
+#define GROUP_SIZE 256
-#define GROUP_SIZE 64
+#define BUFFER_GROUPS 2
-#define BUFFER_GROUPS 4
 #define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
+#define WARP_SIZE 32
+#define INVALID 0xFFFF
 /**
 * Find a bounding box for the atoms in each block.
 */
-extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) {
+extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
+        real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ sortedBlocks) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
-        blockBoundingBox[index] = 0.5f*(maxPos-minPos);
+        real4 blockSize = 0.5f*(maxPos-minPos);
+        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
+        sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
    if (blockIdx.x == 0 && threadIdx.x == 0)
-        interactionCount[0] = 0;
+        rebuildNeighborList[0] = 0;
 }
 /**
- * This is called by findBlocksWithInteractions().  It compacts the list of blocks and writes them
+ * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
- * to global memory.
 */
-__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex,
+extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
-            unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize,
+        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
-            real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) {
+        real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
-    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) {
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
+        int index = (int) sortedBlock[i].y;
+        sortedBlockCenter[i] = blockCenter[index];
+        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+    }
+    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+    bool rebuild = false;
-        temp[i].x = (valid[i] ? 1 : 0);
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        real4 delta = oldPositions[i]-posq[i];
+        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
+            rebuild = true;
+    }
+    if (rebuild) {
+        rebuildNeighborList[0] = 1;
+        interactionCount[0] = 0;
+    }
+}
+/**
+ * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
+ */
+__device__ void prefixSum(short* sum, ushort2* temp) {
+#if __CUDA_ARCH__ >= 300
+    const int indexInWarp = threadIdx.x%WARP_SIZE;
+    const int warpMask = (2<<indexInWarp)-1;
+    for (int base = 0; base < BUFFER_SIZE; base += blockDim.x)
+        temp[base+threadIdx.x].x = __popc(__ballot(sum[base+threadIdx.x])&warpMask);
+    __syncthreads();
+    if (threadIdx.x < BUFFER_SIZE/WARP_SIZE) {
+        int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
+        for (int offset = 1; offset < BUFFER_SIZE/WARP_SIZE; offset *= 2) {
+            short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
+            if (indexInWarp >= offset)
+                multiWarpSum += n;
+        }
+        temp[threadIdx.x].y = multiWarpSum;
+    }
+    __syncthreads();
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
+        sum[i] = temp[i].x+(i < WARP_SIZE ? 0 : temp[i/WARP_SIZE-1].y);
+    __syncthreads();
+#else
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
+        temp[i].x = sum[i];
    __syncthreads();
    int whichBuffer = 0;
    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
        if (whichBuffer == 0)
-            for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+            for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
        else
-            for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+            for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
        whichBuffer = 1-whichBuffer;
        __syncthreads();
    }
    if (whichBuffer == 0)
-        for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
            sum[i] = temp[i].x;
    else
-        for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
            sum[i] = temp[i].y;
    __syncthreads();
-    int numValid = sum[BUFFER_SIZE-1];
+#endif
+}
+/**
+ * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
+ * in them, and writes the result to global memory.
+ */
+__device__ void storeInteractionData(unsigned short x, unsigned short* buffer, short* sum, ushort2* temp, int* atoms, int& numAtoms,
+            int& baseIndex, unsigned int* interactionCount, ushort2* interactingTiles, unsigned int* interactingAtoms, real4 periodicBoxSize,
+            real4 invPeriodicBoxSize, const real4* posq, real3* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
+    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+    if (threadIdx.x < TILE_SIZE) {
+        real3 pos = trimTo3(posq[x*TILE_SIZE+threadIdx.x]);
+        posBuffer[threadIdx.x] = pos;
+#ifdef USE_PERIODIC
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+            pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+            posBuffer[threadIdx.x] = pos;
+        }
+#endif
+    }
+    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
+        sum[i] = (buffer[i] == INVALID ? 0 : 1);
    __syncthreads();
+    prefixSum(sum, temp);
+    int numValid = sum[BUFFER_SIZE-1];
    // Compact the buffer.
-    for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-        if (valid[i]) {
+        if (buffer[i] != INVALID)
-            temp[sum[i]-1] = buffer[i];
+            temp[sum[i]-1].x = buffer[i];
-            sum[i] = valid[i];
+    __syncthreads();
-            valid[i] = false;
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
-            buffer[i] = make_ushort2(1, 1);
+        buffer[i] = temp[i].x;
+    __syncthreads();
+    // Loop over the tiles and find specific interactions in them.
+    const int indexInWarp = threadIdx.x%WARP_SIZE;
+    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
+        for (int i = threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
+            // Check each atom in block Y for interactions.
+            real3 pos = trimTo3(posq[buffer[base+i]*TILE_SIZE+indexInWarp]);
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+            }
+#endif
+            bool interacts = false;
+#ifdef USE_PERIODIC
+            if (!singlePeriodicCopy) {
+                for (int j = 0; j < TILE_SIZE; j++) {
+                    real3 delta = pos-posBuffer[j];
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+            }
+            else {
+#endif
+                for (int j = 0; j < TILE_SIZE; j++) {
+                    real3 delta = pos-posBuffer[j];
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+#ifdef USE_PERIODIC
            }
+#endif
+            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
+        }
+        for (int i = numValid-base+threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
+            sum[i*WARP_SIZE+indexInWarp] = 0;
+        // Compact the list of atoms.
        __syncthreads();
+        prefixSum(sum, temp);
+        for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
+            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
+                atoms[numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
-    // Store it to global memory.
+        // Store them to global memory.
+        int atomsToStore = numAtoms+sum[BUFFER_SIZE-1];
+        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
+        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
+        if (tilesToStore > 0) {
            if (threadIdx.x == 0)
-        *baseIndex = atomicAdd(interactionCount, numValid);
+                baseIndex = atomicAdd(interactionCount, tilesToStore);
            __syncthreads();
-    if (*baseIndex+numValid <= maxTiles)
+            if (threadIdx.x == 0)
-        for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE)
+                numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
-            interactingTiles[*baseIndex+i] = temp[i];
+            if (baseIndex+tilesToStore <= maxTiles) {
+                if (threadIdx.x < tilesToStore)
+                    interactingTiles[baseIndex+threadIdx.x] = make_ushort2(x, singlePeriodicCopy);
+                for (int i = threadIdx.x; i < tilesToStore*TILE_SIZE; i += blockDim.x)
+                    interactingAtoms[baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
+            }
+        }
+        else {
+            __syncthreads();
+            if (threadIdx.x == 0)
+                numAtoms += sum[BUFFER_SIZE-1];
+        }
        __syncthreads();
+        if (threadIdx.x < numAtoms && !storePartialTile)
+            atoms[threadIdx.x] = atoms[tilesToStore*TILE_SIZE+threadIdx.x];
+    }
+    if (numValid == 0 && numAtoms > 0 && finish) {
+        // We didn't have any more tiles to process, but there were some atoms left over from a
+        // previous call to this function.  Save them now.
+        if (threadIdx.x == 0)
+            baseIndex = atomicAdd(interactionCount, 1);
+        __syncthreads();
+        if (baseIndex < maxTiles) {
+            if (threadIdx.x == 0)
+                interactingTiles[baseIndex] = make_ushort2(x, singlePeriodicCopy);
+            if (threadIdx.x < TILE_SIZE)
+                interactingAtoms[baseIndex*TILE_SIZE+threadIdx.x] = (threadIdx.x < numAtoms ? atoms[threadIdx.x] : NUM_ATOMS);
+        }
+    }
+    // Reset the buffer for processing more tiles.
+    for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
+        buffer[i] = INVALID;
 }
 /**
@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
 */
 extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
-        unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
+        unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startBlockIndex,
-        unsigned int numTiles) {
+        unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
-    __shared__ ushort2 buffer[BUFFER_SIZE];
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
-    __shared__ int valid[BUFFER_SIZE];
+        const int* __restrict__ rebuildNeighborList) {
+    __shared__ unsigned short buffer[BUFFER_SIZE];
    __shared__ short sum[BUFFER_SIZE];
    __shared__ ushort2 temp[BUFFER_SIZE];
+    __shared__ int atoms[BUFFER_SIZE+TILE_SIZE];
+    __shared__ real3 posBuffer[TILE_SIZE];
+    __shared__ int exclusionsForX[MAX_EXCLUSIONS];
    __shared__ int bufferFull;
    __shared__ int globalIndex;
-    unsigned int endTileIndex = startTileIndex+numTiles;
+    __shared__ int numAtoms;
+    if (rebuildNeighborList[0] == 0)
+        return; // The neighbor list doesn't need to be rebuilt.
    int valuesInBuffer = 0;
    if (threadIdx.x == 0)
        bufferFull = false;
    for (int i = 0; i < BUFFER_GROUPS; ++i)
-        valid[i*GROUP_SIZE+threadIdx.x] = false;
+        buffer[i*GROUP_SIZE+threadIdx.x] = INVALID;
    __syncthreads();
-    for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
-        // Identify the pair of blocks to compare.
-        int index = baseIndex+threadIdx.x;
+    // Loop over blocks sorted by size.
-        if (index < endTileIndex) {
-            unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
+    for (int i = startBlockIndex+blockIdx.x; i < startBlockIndex+numBlocks; i += gridDim.x) {
-            unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+        if (threadIdx.x == blockDim.x-1)
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            numAtoms = 0;
-                y += (x < y ? -1 : 1);
+        real2 sortedKey = sortedBlocks[i];
-                x = (index-y*NUM_BLOCKS+y*(y+1)/2);
+        unsigned short x = (unsigned short) sortedKey.y;
-            }
+        real4 blockCenterX = blockCenter[x];
+        real4 blockSizeX = blockBoundingBox[x];
-            // Find the distance between the bounding boxes of the two cells.
+        // Load exclusion data for block x.
-            real4 delta = blockCenter[x]-blockCenter[y];
+        const int exclusionStart = exclusionRowIndices[x];
-            real4 boxSizea = blockBoundingBox[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
-            real4 boxSizeb = blockBoundingBox[y];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        for (int j = threadIdx.x; j < numExclusions; j += blockDim.x)
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        __syncthreads();
+        // Compare it to other blocks after this one in sorted order.
+        for (int base = i+1; base < NUM_BLOCKS; base += blockDim.x) {
+            int j = base+threadIdx.x;
+            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : make_real2(0));
+            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : make_real4(0));
+            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : make_real4(0));
+            unsigned short y = (unsigned short) sortedKey2.y;
+            real4 delta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-            delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+            delta.x = max(0.0f, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
-            delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+            delta.y = max(0.0f, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
-            delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+            delta.z = max(0.0f, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
-            if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED) {
+            bool hasExclusions = false;
+            for (int k = 0; k < numExclusions; k++)
+                hasExclusions |= (exclusionsForX[k] == y);
+            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
                // Add this tile to the buffer.
                int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
-                valid[bufferIndex] = true;
+                buffer[bufferIndex] = y;
-                buffer[bufferIndex] = make_ushort2(x, y);
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
-        }
            __syncthreads();
            if (bufferFull) {
-            storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+                storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
                valuesInBuffer = 0;
                if (threadIdx.x == 0)
                    bufferFull = false;
                __syncthreads();
            }
        }
-    storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
+        storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
-}
-/**
- * Compare each atom in one block to the bounding box of another block, and set
- * flags for which ones are interacting.
- */
-extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
-            const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-    unsigned int index = threadIdx.x & (TILE_SIZE - 1);
-#if (__CUDA_ARCH__ < 200)
-    __shared__ unsigned int flags[128];
-#endif
-    if (numTiles > maxTiles)
-        return;
-    unsigned int lasty = 0xFFFFFFFF;
-    real4 apos;
-    while (pos < end) {
-        // Extract the coordinates of this tile
-        ushort2 tileIndices = tiles[pos];
-        unsigned int x = tileIndices.x;
-        unsigned int y = tileIndices.y;
-        if (x == y) {
-            if (index == 0)
-                interactionFlags[pos] = 0xFFFFFFFF;
    }
-        else {
-            // Load the bounding box for x and the atom positions for y.
-            real4 center = blockCenter[x];
-            real4 boxSize = blockBoundingBox[x];
-            if (y != lasty)
-                apos = posq[y*TILE_SIZE+index];
-            // Find the distance of the atom from the bounding box.
+    // Record the positions the neighbor list is based on.
-            real4 delta = apos-center;
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
-#ifdef USE_PERIODIC
+        oldPositions[i] = posq[i];
-                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-#endif
-            delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
-            delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
-            delta.z = max((real) 0, fabs(delta.z)-boxSize.z);
-#if (__CUDA_ARCH__ < 200)
-            flags[threadIdx.x] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED ? 0 : 1 << index);
-            if (index % 4 == 0)
-                flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
-            unsigned int allFlags = 0;
-            if (index == 0)
-                allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
-#else
-            unsigned int allFlags = __ballot(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED);
-#endif
-            // Sum the flags.
-            if (index == 0) {
-                // Count how many flags are set, and based on that decide whether to compute all interactions
-                // or only a fraction of them.
-                int bits = __popc(allFlags);
-                interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
-            }
-            lasty = y;
-        }
-        pos++;
-    }
 }
--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
 #define DIELECTRIC_OFFSET 0.009f
 #define PROBE_RADIUS 0.14f
 #define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
-#define TILE_SIZE 32
 #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
 /**
@@ -70,58 +69,30 @@ typedef struct {
 */
 extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
+        const ushort2* __restrict__ exclusionTiles) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    unsigned int lasty = 0xFFFFFFFF;
-    __shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-#ifndef ENABLE_SHUFFLE
-    __shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
-#endif    
-    do {
-        // Extract the coordinates of this tile
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+    __shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
-        unsigned int x, y;
+    // First loop: process tiles that contain exclusions.
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real bornSum = 0;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        float2 params1 = global_params[atom1];
-            if (pos >= end)
+        if (x == y) {
-                ; // This warp is done.
-            else if (x == y) {
            // This tile is on the diagonal.
            localData[threadIdx.x].x = posq1.x;
@@ -155,8 +126,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                            (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                            if (params1.x < params2.y-r)
+                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                    }
                }
            }
@@ -164,7 +134,6 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
        else {
            // This is an off-diagonal tile.
-                if (lasty != y) {
            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
            localData[threadIdx.x].x = tempPosq.x;
@@ -174,46 +143,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
            float2 tempParams = global_params[j];
            localData[threadIdx.x].radius = tempParams.x;
            localData[threadIdx.x].scaledRadius = tempParams.y;
-                }
            localData[threadIdx.x].bornSum = 0.0f;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                bool computeSubset = false;
-                if (flags != 0xFFFFFFFF) {
-                    if (tgx < 2)
-                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-                    if (tgx == 0)
-                        exclusionIndex[localGroupIndex] = -1;
-                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                        if (exclusionIndices[i] == y)
-                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-                    computeSubset = (exclusionIndex[localGroupIndex] == -1);
-                }
-                if (computeSubset) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+            // Compute the full set of interactions in this tile.
-                            if ((flags&(1<<j)) != 0) {
-                                real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                real sum = 0;
 #ifdef USE_CUTOFF
-                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
-                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
 #endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
-                                    float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
+                    float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
                        real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
@@ -223,8 +173,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                            (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                                        if (params1.x < params2.y-r)
+                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                                            bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                    }
                    real rScaledRadiusI = r+params1.y;
                    if (params2.x < rScaledRadiusI) {
@@ -235,37 +184,160 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                            (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                                        if (params2.x < params1.y-r)
+                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                                            term += 2.0f*(RECIP(params2.x)-l_ij);
+                        localData[tbx+tj].bornSum += term;
-                                        sum = term;
+                    }
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
            }
        }
-                                // Sum the forces on atom j.
+        // Write results.
-#ifdef ENABLE_SHUFFLE
+        unsigned int offset = x*TILE_SIZE + tgx;
-                                for (int i = 16; i >= 1; i /= 2)
+        atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
-                                    sum += __shfl_xor(sum, i, 32);
+        if (x != y) {
-                                if (tgx == 0)
+            offset = y*TILE_SIZE + tgx;
-                                    localData[tbx+j].bornSum += sum;
+            atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
+        }
+    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
 #else
-                                tempBuffer[threadIdx.x] = sum;
+    int pos = warp*numTiles/totalWarps;
-                                if (tgx % 4 == 0)
+    int end = (warp+1)*numTiles/totalWarps;
-                                    tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
-                                if (tgx == 0)
-                                    localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
 #endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
+    __shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    while (pos < end) {
+        real bornSum = 0;
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            float2 params1 = global_params[atom1];
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                real4 tempPosq = posq[j];
+                localData[threadIdx.x].x = tempPosq.x;
+                localData[threadIdx.x].y = tempPosq.y;
+                localData[threadIdx.x].z = tempPosq.z;
+                localData[threadIdx.x].q = tempPosq.w;
+                float2 tempParams = global_params[j];
+                localData[threadIdx.x].radius = tempParams.x;
+                localData[threadIdx.x].scaledRadius = tempParams.y;
+                localData[threadIdx.x].bornSum = 0.0f;
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                        real rScaledRadiusJ = r+params2.y;
+                        if (params1.x < rScaledRadiusJ) {
+                            real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
+                            real u_ij = RECIP(rScaledRadiusJ);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                (params2.y*params2.y*invR)*(l_ij2-u_ij2));
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
+                        }
+                        real rScaledRadiusI = r+params1.y;
+                        if (params2.x < rScaledRadiusI) {
+                            real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
+                            real u_ij = RECIP(rScaledRadiusI);
+                            real l_ij2 = l_ij*l_ij;
+                            real u_ij2 = u_ij*u_ij;
+                            real ratio = LOG(u_ij * RECIP(l_ij));
+                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
+                                                (params1.y*params1.y*invR)*(l_ij2-u_ij2));
+                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
+                            localData[tbx+tj].bornSum += term;
+                        }
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
            else
 #endif
            {
-                    // Compute the full set of interactions in this tile.
+                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                for (j = 0; j < TILE_SIZE; j++) {
                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -273,10 +345,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    int atom2 = atomIndices[tbx+tj];
 #ifdef USE_CUTOFF
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
 #endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
@@ -290,8 +363,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                (params2.y*params2.y*invR)*(l_ij2-u_ij2));
-                                if (params1.x < params2.y-r)
+                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
-                                    bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
@@ -302,30 +374,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                (params1.y*params1.y*invR)*(l_ij2-u_ij2));
-                                if (params2.x < params1.y-r)
+                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
-                                    term += 2.0f*(RECIP(params2.x)-l_ij);
                            localData[tbx+tj].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
-            }
-        }
            // Write results.
-        if (pos < end) {
+            atomicAdd(&global_bornSum[atom1], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
-            const unsigned int offset = x*TILE_SIZE + tgx;
+#ifdef USE_CUTOFF
-            atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
+            unsigned int atom2 = atomIndices[threadIdx.x];
-        }
+#else
-        if (pos < end && x != y) {
+            unsigned int atom2 = y*TILE_SIZE + tgx;
-            const unsigned int offset = y*TILE_SIZE + tgx;
+#endif
-            atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
+            if (atom2 < PADDED_NUM_ATOMS)
+                atomicAdd(&global_bornSum[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
        }
-        lasty = y;
        pos++;
-    } while (pos < end);
+    }
 }
 typedef struct {
@@ -342,54 +411,27 @@ typedef struct {
 extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
        real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
+        const ushort2* __restrict__ exclusionTiles) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-    unsigned int numTiles = interactionCount[0];
+    const unsigned int tbx = threadIdx.x - tgx;
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
    __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-#ifndef ENABLE_SHUFFLE
-    __shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
-#endif    
-    do {
+    // First loop: process tiles that contain exclusions.
-        // Extract the coordinates of this tile
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int tbx = threadIdx.x - tgx;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
-        unsigned int x, y;
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real4 force = make_real4(0);
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        real bornRadius1 = global_bornRadii[atom1];
@@ -441,7 +483,6 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        else {
            // This is an off-diagonal tile.
-                if (lasty != y) {
            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
            localData[threadIdx.x].x = tempPosq.x;
@@ -449,35 +490,15 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
            localData[threadIdx.x].z = tempPosq.z;
            localData[threadIdx.x].q = tempPosq.w;
            localData[threadIdx.x].bornRadius = global_bornRadii[j];
-                }
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
            localData[threadIdx.x].fz = 0.0f;
            localData[threadIdx.x].fw = 0.0f;
-#ifdef USE_CUTOFF
+            unsigned int tj = tgx;
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+            for (j = 0; j < TILE_SIZE; j++) {
-                bool computeSubset = false;
+                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                if (flags != 0xFFFFFFFF) {
+                    real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
-                    if (tgx < 2)
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
-                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-                    if (tgx == 0)
-                        exclusionIndex[localGroupIndex] = -1;
-                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                        if (exclusionIndices[i] == y)
-                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-                    computeSubset = (exclusionIndex[localGroupIndex] == -1);
-                }
-                if (computeSubset) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
-                                real4 delta = make_real4(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z, 0);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
@@ -489,7 +510,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
 #endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
-                                real bornRadius2 = localData[tbx+j].bornRadius;
+                        real bornRadius2 = localData[tbx+tj].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
@@ -499,67 +520,178 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
-#ifdef USE_CUTOFF
-                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
-#else
-                                if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
-#endif
-                                    dEdR = 0.0f;
-                                    dGpol_dalpha2_ij = 0.0f;
-                                    tempEnergy = 0.0f;
-                                }
-                                energy += tempEnergy;
                        force.w += dGpol_dalpha2_ij*bornRadius2;
+                        energy += tempEnergy;
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
-                                delta.w = dGpol_dalpha2_ij*bornRadius1;
+                        localData[tbx+tj].fx += delta.x;
+                        localData[tbx+tj].fy += delta.y;
+                        localData[tbx+tj].fz += delta.z;
+                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
 #ifdef USE_CUTOFF
                    }
-                                else
-                                    delta = make_real4(0);
 #endif
+                }
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+        }
-                                // Sum the forces on atom j.
+        // Write results.
-#ifdef ENABLE_SHUFFLE
+        unsigned int offset = x*TILE_SIZE + tgx;
-                                for (int i = 16; i >= 1; i /= 2) {
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-                                    delta.x += __shfl_xor(delta.x, i, 32);
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-                                    delta.y += __shfl_xor(delta.y, i, 32);
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-                                    delta.z += __shfl_xor(delta.z, i, 32);
+        atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
-                                    delta.w += __shfl_xor(delta.w, i, 32);
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+            atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
        }
-                                if (tgx == 0) {
-                                    localData[tbx+j].fx += delta.x;
-                                    localData[tbx+j].fy += delta.y;
-                                    localData[tbx+j].fz += delta.z;
-                                    localData[tbx+j].fw += delta.w;
    }
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
 #else
-                                tempBuffer[threadIdx.x] = delta;
+    int pos = warp*numTiles/totalWarps;
-                                if (tgx % 4 == 0)
+    int end = (warp+1)*numTiles/totalWarps;
-                                    tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
+#endif
-                                if (tgx == 0) {
+    int skipBase = 0;
-                                    real4 sum = tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
+    int currentSkipIndex = tbx;
-                                    localData[tbx+j].fx += sum.x;
+    __shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
-                                    localData[tbx+j].fy += sum.y;
+    __shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
-                                    localData[tbx+j].fz += sum.z;
+    skipTiles[threadIdx.x] = -1;
-                                    localData[tbx+j].fw += sum.w;
+    while (pos < end) {
+        real4 force = make_real4(0);
+        bool includeTile = true;
+        // Extract the coordinates of this tile.
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+            // Skip over tiles that have exclusions, since they were already processed.
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            real bornRadius1 = global_bornRadii[atom1];
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
 #endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                real4 tempPosq = posq[j];
+                localData[threadIdx.x].x = tempPosq.x;
+                localData[threadIdx.x].y = tempPosq.y;
+                localData[threadIdx.x].z = tempPosq.z;
+                localData[threadIdx.x].q = tempPosq.w;
+                localData[threadIdx.x].bornRadius = global_bornRadii[j];
+                localData[threadIdx.x].fx = 0.0f;
+                localData[threadIdx.x].fy = 0.0f;
+                localData[threadIdx.x].fz = 0.0f;
+                localData[threadIdx.x].fw = 0.0f;
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        if (r2 < CUTOFF_SQUARED) {
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
+                            real bornRadius2 = localData[tbx+tj].bornRadius;
+                            real alpha2_ij = bornRadius1*bornRadius2;
+                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
+                            real expTerm = EXP(-D_ij);
+                            real denominator2 = r2 + alpha2_ij*expTerm;
+                            real denominator = SQRT(denominator2);
+                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
+                            real Gpol = tempEnergy*RECIP(denominator2);
+                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
+                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
+                            force.w += dGpol_dalpha2_ij*bornRadius2;
+                            energy += tempEnergy;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
+                            localData[tbx+tj].fx += delta.x;
+                            localData[tbx+tj].fy += delta.y;
+                            localData[tbx+tj].fz += delta.z;
+                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
            else
 #endif
            {
-                    // Compute the full set of interactions in this tile.
+                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                for (j = 0; j < TILE_SIZE; j++) {
-                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
+                    int atom2 = atomIndices[tbx+tj];
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
@@ -600,27 +732,26 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
-            }
-        }
            // Write results.
-        if (pos < end) {
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&global_bornForce[atom1], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+#ifdef USE_CUTOFF
-            atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+                atomicAdd(&global_bornForce[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
            }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
-            atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
        }
-        lasty = y;
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x1 = sqrt(-2.0f * log(x1));
+        x1 = SQRT(-2.0f * LOG(x1));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.x = x1 * cos(2.0f * 3.14159265f * x2);
+        value.x = x1 * COS(2.0f * 3.14159265f * x2);
        // Generate second value.
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x3 = sqrt(-2.0f * log(x3));
+        x3 = SQRT(-2.0f * LOG(x3));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+        value.y = x3 * COS(2.0f * 3.14159265f * x4);
        // Generate third value.
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x5 = sqrt(-2.0f * log(x5));
+        x5 = SQRT(-2.0f * LOG(x5));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+        value.z = x5 * COS(2.0f * 3.14159265f * x6);
        // Generate fourth value.
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x7 = sqrt(-2.0f * log(x7));
+        x7 = SQRT(-2.0f * LOG(x7));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+        value.w = x7 * COS(2.0f * 3.14159265f * x8);
        // Record the values.
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
-        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
        mixed trns11 = xaksXd / axlng;
        mixed trns21 = yaksXd / axlng;
        mixed trns31 = zaksXd / axlng;
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        //                                        --- Step2  A2' ---
        float rc = 0.5f*params.y;
-        mixed rb = sqrt(params.x*params.x-rc*rc);
+        mixed rb = SQRT(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d/ra;
-        mixed cosphi = sqrt(1-sinphi*sinphi);
+        mixed cosphi = SQRT(1-sinphi*sinphi);
        mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
-        mixed cospsi = sqrt(1-sinpsi*sinpsi);
+        mixed cospsi = SQRT(1-sinpsi*sinpsi);
        mixed ya2d =   ra*cosphi;
        mixed xb2d = - rc*cospsi;
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
        mixed xb2d2 = xb2d*xb2d;
        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5f;
        //                                        --- Step3  al,be,ga ---
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
+        mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
        //                                        --- Step4  A3' ---
-        mixed costheta = sqrt(1-sintheta*sintheta);
+        mixed costheta = SQRT(1-sintheta*sintheta);
        mixed xa3d = - ya2d*sintheta;
        mixed ya3d =   ya2d*costheta;
        mixed za3d = za1d;
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
        mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
        mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
        mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
-        eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
+        eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
+        eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
+        eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
 /**
 * Compute the direction each CCMA constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) {
+extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
+        const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the direction for this constraint.
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        converged[0] = 1;
+        converged[1] = 0;
+    }
 }
 /**
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
    __syncthreads();
    mixed lowerTol = 1-2*tol+tol*tol;
    mixed upperTol = 1+2*tol+tol*tol;
+    bool threadConverged = true;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
        mixed dist2 = dir.w*dir.w;
        mixed diff = dist2 - rp2;
        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
+        threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
-        // See whether it has converged.
+    }
+    if (groupConverged && !threadConverged)
-        if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
        groupConverged = 0;
+    __syncthreads();
+    if (threadIdx.x == 0 && !groupConverged)
        converged[iteration%2] = 0;
-        }
-    }
 }
 /**