Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -61,7 +61,7 @@ using namespace OpenMM;
 using namespace std;

 const int CudaContext::ThreadBlockSize = 64;
-const int CudaContext::TileSize = 32;
+const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
+    src << "typedef unsigned int tileflags;\n";
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -42,6 +42,8 @@
 #include "windowsExportCuda.h"
 #include "CudaPlatform.h"

+typedef unsigned int tileflags;
+
 namespace OpenMM {

 class CudaArray;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
-        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
        vector<int2> atomsVec(ccmaAtoms->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete ccmaDelta1;
    if (ccmaDelta2 != NULL)
        delete ccmaDelta2;
-    if (ccmaConvergedMemory != NULL)
-        cuMemFreeHost(ccmaConvergedMemory);
+    if (ccmaConverged != NULL)
+        delete ccmaConverged;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
    }
    if (ccmaAtoms != NULL) {
-        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
+        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
        int i;
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory,
+                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
                tolPointer, &i};
        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i};
+                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConvergedDeviceMemory, &i};
+                &ccmaConverged->getDevicePointer(), &i};
        const int checkInterval = 4;
+        int* converged = (int*) context.getPinnedBuffer();
        for (i = 0; i < 150; i++) {
-            if (i == 0) {
-                ccmaConvergedMemory[0] = 1;
-                ccmaConvergedMemory[1] = 0;
-            }
            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0)
+            if ((i+1)%checkInterval == 0) {
+                ccmaConverged->download(converged, false);
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+            }
            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
-                if (ccmaConvergedMemory[i%2])
+                if (converged[i%2])
                    break;
            }
        }

--- a/platforms/cuda/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda/src/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -140,8 +140,7 @@ private:
    CudaArray* ccmaConstraintMatrixValue;
    CudaArray* ccmaDelta1;
    CudaArray* ccmaDelta2;
-    int* ccmaConvergedMemory;
-    CUdeviceptr ccmaConvergedDeviceMemory;
+    CudaArray* ccmaConverged;
    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -607,8 +606,6 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
-    CudaArray* pmeBsplineTheta;
-    CudaArray* pmeBsplineDTheta;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
@@ -617,9 +614,6 @@ private:
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
-    CUfunction pmeAtomRangeKernel;
-    CUfunction pmeZIndexKernel;
-    CUfunction pmeUpdateBsplinesKernel;
    CUfunction pmeSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
@@ -776,6 +770,8 @@ private:
    System& system;
    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
+    std::string pairValueSrc, pairEnergySrc;
+    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
 };

 /**

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
--- a/platforms/cuda/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda/src/CudaNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -36,6 +36,8 @@

 namespace OpenMM {
    
+class CudaSort;
+
 /**
 * This class provides a generic interface for calculating nonbonded interactions.  It does this in two
 * ways.  First, it can be used to create kernels that evaluate nonbonded interactions.  Clients
@@ -181,10 +183,10 @@ public:
        return *interactingTiles;
    }
    /**
-     * Get the array containing flags for tiles with interactions.
+     * Get the array containing the atoms in each tile with interactions.
     */
-    CudaArray& getInteractionFlags() {
-        return *interactionFlags;
+    CudaArray& getInteractingAtoms() {
+        return *interactingAtoms;
    }
    /**
     * Get the array containing exclusion flags.
@@ -192,6 +194,12 @@ public:
    CudaArray& getExclusions() {
        return *exclusions;
    }
+    /**
+     * Get the array containing tiles with exclusions.
+     */
+    CudaArray& getExclusionTiles() {
+        return *exclusionTiles;
+    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
@@ -217,9 +225,17 @@ public:
        return numTiles;
    }
    /**
-     * Set the range of tiles that should be processed by this context.
+     * Set whether to add padding to the cutoff distance when building the neighbor list.
+     * This increases the size of the neighbor list (and thus the cost of computing interactions),
+     * but also means we don't need to rebuild it every time step.  The default value is true,
+     * since usually this improves performance.  For very expensive interactions, however,
+     * it may be better to set this to false.
+     */
+    void setUsePadding(bool padding);
+    /**
+     * Set the range of atom blocks and tiles that should be processed by this context.
     */
-    void setTileRange(int startTileIndex, int numTiles);
+    void setAtomBlockRange(double startFraction, double endFraction);
    /**
     * Create a Kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
     * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
@@ -232,42 +248,38 @@ public:
     * @param isSymmetric   specifies whether the interaction is symmetric
     */
    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-    /**
-     * This is a utility routine for locating data in the exclusions array.  It takes the (x,y) indices of a tile,
-     * and returns the location in the array where the data for that tile begins.
-     * 
-     * This routine requires that x >= y.  If not, it will throw an exception.
-     * 
-     * @param x                   the x index of the tile
-     * @param y                   the y index of the tile
-     * @param exclusionIndices    the content of the exclusionIndices array
-     * @param exclusionRowIndices the content of the exclusionRowIndices array
-     * @return the index in the exclusions array at which the data for that tile begins
-     */
-    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
 private:
+    class BlockSortTrait;
    CudaContext& context;
    CUfunction forceKernel;
    CUfunction findBlockBoundsKernel;
+    CUfunction sortBoxDataKernel;
    CUfunction findInteractingBlocksKernel;
    CUfunction findInteractionsWithinBlocksKernel;
+    CudaArray* exclusionTiles;
    CudaArray* exclusions;
    CudaArray* exclusionIndices;
    CudaArray* exclusionRowIndices;
    CudaArray* interactingTiles;
-    CudaArray* interactionFlags;
+    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
-    std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
+    CudaArray* sortedBlocks;
+    CudaArray* sortedBlockCenter;
+    CudaArray* sortedBlockBoundingBox;
+    CudaArray* oldPositions;
+    CudaArray* rebuildNeighborList;
+    CudaSort* blockSorter;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
    std::string kernelSource;
    std::map<std::string, std::string> kernelDefines;
    double cutoff;
-    bool useCutoff, usePeriodic, anyExclusions;
-    int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
 };

 /**

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -118,7 +118,7 @@ private:
 };

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    sumKernel = cu.getKernel(module, "sumForces");
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system);
+    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
+        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
        cu.executeKernel(sumKernel, args, bufferSize);
        
-        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        
        int firstIndex = 0, lastIndex = 0;
-        int totalTiles = 0;
        for (int i = 0; i < (int) completionTimes.size(); i++) {
            if (completionTimes[i] < completionTimes[firstIndex])
                firstIndex = i;
            if (completionTimes[i] > completionTimes[lastIndex])
                lastIndex = i;
-            contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
-            totalTiles += contextTiles[i];
-        }
-        int tilesToTransfer = totalTiles/1000;
-        if (tilesToTransfer < 1)
-            tilesToTransfer = 1;
-        if (tilesToTransfer > contextTiles[lastIndex])
-            tilesToTransfer = contextTiles[lastIndex];
-        contextTiles[firstIndex] += tilesToTransfer;
-        contextTiles[lastIndex] -= tilesToTransfer;
-        int startIndex = 0;
-        for (int i = 0; i < (int) contextTiles.size(); i++) {
-            data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
-            startIndex += contextTiles[i];
+        }
+        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
+        contextNonbondedFractions[firstIndex] += fractionToTransfer;
+        contextNonbondedFractions[lastIndex] -= fractionToTransfer;
+        double startFraction = 0.0;
+        for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
+            double endFraction = startFraction+contextNonbondedFractions[i];
+            if (i == contextNonbondedFractions.size()-1)
+                endFraction = 1.0; // Avoid roundoff error
+            data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
+            startFraction = endFraction;
        }
    }
    return energy;

--- a/platforms/cuda/src/CudaParallelKernels.h
+++ b/platforms/cuda/src/CudaParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ private:
    CudaPlatform::PlatformData& data;
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
-    std::vector<int> contextTiles;
+    std::vector<double> contextNonbondedFractions;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaSort.cpp
+++ b/platforms/cuda/src/CudaSort.cpp
@@ -32,7 +32,7 @@ using namespace OpenMM;
 using namespace std;

 CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
-        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    map<string, string> replacements;
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
+    shortListKernel = context.getKernel(module, "sortShortList");
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)

    int maxBlockSize;
    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
+    int maxSharedMem;
+    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
+    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    isShortList = (length <= maxLocalBuffer);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
-    sortKernelSize = rangeKernelSize/2;
+    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
-    int maxSharedMem;
-    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
-    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)

    // Create workspace arrays.

+    if (!isShortList) {
        dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
        offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
        buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
+    }
 }

 CudaSort::~CudaSort() {
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
 }

 void CudaSort::sort(CudaArray& data) {
-    if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
+    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;
+    if (isShortList) {
+        // We can use a simpler sort kernel that does the entire operation at once in local memory.
        
+        void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
+        context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
+    }
+    else {
        // Compute the range of data values.

-    unsigned int dataSize = data.getSize();
-    void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

        // Assign array elements to buckets.

        unsigned int numBuckets = bucketOffset->getSize();
        context.clearBuffer(*bucketOffset);
-    void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
+        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
                &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
        context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {

        // Copy the data into the buckets.

-    void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
+        void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
                &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
        context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {

        void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+    }
 }
--- a/platforms/cuda/src/CudaSort.h
+++ b/platforms/cuda/src/CudaSort.h
@@ -92,8 +92,9 @@ private:
    CudaArray* offsetInBucket;
    CudaArray* bucketOffset;
    CudaArray* buckets;
-    CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+    CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList;
 };

 /**

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
 #if USE_EWALD
-bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
 if (!isExcluded || needCorrection) {
-    real tempForce = 0.0f;
    if (r2 < CUTOFF_SQUARED || needCorrection) {
        const real alphaR = EWALD_ALPHA*r;
        const real expAlphaRSqr = EXP(-alphaR*alphaR);
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
        t *= t;
        t *= t;
        const real erfcAlphaR = RECIP(t*t);
+        real tempForce = 0.0f;
        if (needCorrection) {
            // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
            tempEnergy += prefactor*erfcAlphaR;
 #endif
        }
-    }
        dEdR += tempForce*invR*invR;
+    }
 }
 #else
 {

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
 #define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
 #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
-#define TILE_SIZE 32

 typedef struct {
    real4 posq;
@@ -15,88 +14,43 @@ typedef struct {
 * Compute a force based on pair interactions.
 */
 extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
-        const unsigned int* __restrict__ exclusionRowIndices,
+        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+    const unsigned int tbx = threadIdx.x - tgx;
    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];

-    do {
-        // Extract the coordinates of this tile
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-        const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
-        unsigned int x, y;
+    // First loop: process tiles that contain exclusions.
+    
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
-
-            // Locate the exclusion data for this tile.
-
 #ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
+        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = threadIdx.x;
            localData[localAtomIndex].posq = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                int atom2 = tbx+j;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = threadIdx.x;
-                if (lasty != y) {
            unsigned int j = y*TILE_SIZE + tgx;
            localData[localAtomIndex].posq = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
            localData[localAtomIndex].force = make_real3(0);
            CLEAR_LOCAL_DERIVATIVES
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags == 0) {
-                    // No interactions in this tile.
-                }
-                else
-#endif
-                {
-                    // Compute the full set of interactions in this tile.
-
 #ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
            unsigned int tj = tgx;
-                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
-#endif
+            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
        }
-            }
-        }
-        lasty = y;

        // Write results.

-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
+        unsigned int offset = x*TILE_SIZE + tgx;
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
        STORE_DERIVATIVES_1
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
            STORE_DERIVATIVES_2
        }
+    }
+
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    
+    while (pos < end) {
+        const bool isExcluded = false;
+        real3 force = make_real3(0);
+        DECLARE_ATOM1_DERIVATIVES
+        bool includeTile = true;
+        
+        // Extract the coordinates of this tile.
+        
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
+        }
+        else
+#endif
+        {
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }
+
+            // Skip over tiles that have exclusions, since they were already processed.
+
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+
+            // Load atom data for this tile.
+
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                localData[localAtomIndex].posq = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].force = make_real3(0);
+                CLEAR_LOCAL_DERIVATIVES
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        atom2 = tbx+tj;
+                        localData[atom2].force.x += delta.x;
+                        localData[atom2].force.y += delta.y;
+                        localData[atom2].force.z += delta.z;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        atom2 = tbx+tj;
+                        localData[atom2].force.x += delta.x;
+                        localData[atom2].force.y += delta.y;
+                        localData[atom2].force.z += delta.z;
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+        
+            // Write results.
+
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+            unsigned int offset = atom1;
+            STORE_DERIVATIVES_1
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+                offset = atom2;
+                STORE_DERIVATIVES_2
+            }
+        }
        pos++;
-    } while (pos < end);
+    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
-#define TILE_SIZE 32
-
 typedef struct {
    real4 posq;
    real value, temp;
@@ -13,86 +11,41 @@ typedef struct {
 * Compute a value based on pair interactions.
 */
 extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
-        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
+        const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
 #ifdef USE_CUTOFF
-        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-#ifdef USE_CUTOFF
-    unsigned int numTiles = interactionCount[0];
-    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
-#else
-    unsigned int pos = warp*numTiles/totalWarps;
-    unsigned int end = (warp+1)*numTiles/totalWarps;
-#endif
-    real energy = 0;
-    unsigned int lasty = 0xFFFFFFFF;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
-    __shared__ int exclusionIndex[WARPS_PER_GROUP];
-    
-    do {
-        // Extract the coordinates of this tile
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
-        unsigned int x, y;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+
+    // First loop: process tiles that contain exclusions.
+    
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const ushort2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
        real value = 0;
-        if (pos < end) {
-#ifdef USE_CUTOFF
-            if (numTiles <= maxTiles) {
-                ushort2 tileIndices = tiles[pos];
-                x = tileIndices.x;
-                y = tileIndices.y;
-            }
-            else
-#endif
-            {
-                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                    y += (x < y ? -1 : 1);
-                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-                }
-            }
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
-
-            // Locate the exclusion data for this tile.
-
 #ifdef USE_EXCLUSIONS
-            if (tgx < 2)
-                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
-            if (tgx == 0)
-                exclusionIndex[localGroupIndex] = -1;
-            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
-                if (exclusionIndices[i] == y)
-                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
-            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
-#else
-            bool hasExclusions = false;
+        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
 #endif
-            if (pos >= end)
-                ; // This warp is done.
-            else if (x == y) {
+        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = threadIdx.x;
            localData[localAtomIndex].posq = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
-#ifdef USE_EXCLUSIONS
-                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
-#endif
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                    bool isExcluded = !(excl & 0x1);
-#endif
                int atom2 = tbx+j;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    real tempValue1 = 0;
                    real tempValue2 = 0;
 #ifdef USE_EXCLUSIONS
-                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded && atom1 != atom2) {
 #else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
 #endif
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        else {
            // This is an off-diagonal tile.

-                if (lasty != y) {
-                    unsigned int j = y*TILE_SIZE + tgx;
-                    localData[threadIdx.x].posq = posq[j];
            const unsigned int localAtomIndex = threadIdx.x;
+            unsigned int j = y*TILE_SIZE + tgx;
+            localData[localAtomIndex].posq = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                }
-                localData[threadIdx.x].value = 0;
-#ifdef USE_CUTOFF
-                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
-                if (!hasExclusions && flags != 0xFFFFFFFF) {
-                    if (flags == 0) {
-                        // No interactions in this tile.
-                    }
-                    else {
-                        // Compute only a subset of the interactions in this tile.
-
-                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                            if ((flags&(1<<j)) != 0) {
-                                int atom2 = tbx+j;
+            localData[localAtomIndex].value = 0;
+#ifdef USE_EXCLUSIONS
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
                real4 posq2 = localData[atom2].posq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                real tempValue1 = 0;
-                                real tempValue2 = 0;
+#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
+#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
-                                    atom2 = y*TILE_SIZE+j;
+                    atom2 = y*TILE_SIZE+tj;
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
+                    if (!isExcluded) {
+#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
+                    localData[tbx+tj].value += tempValue2;
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
        }
-                                localData[threadIdx.x].temp = tempValue2;

-                                // Sum the forces on atom2.
+        // Write results.

-                                if (tgx % 4 == 0)
-                                    localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
-                                if (tgx == 0)
-                                    localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
-                            }
+        unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
+        if (x != y) {
+            offset = y*TILE_SIZE + tgx;
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
        }
    }
+
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
+#else
+    int pos = warp*numTiles/totalWarps;
+    int end = (warp+1)*numTiles/totalWarps;
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+    
+    while (pos < end) {
+        real value = 0;
+        bool includeTile = true;
+        
+        // Extract the coordinates of this tile.
+        
+        unsigned int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        if (numTiles <= maxTiles) {
+            ushort2 tileIndices = tiles[pos];
+            x = tileIndices.x;
+            singlePeriodicCopy = tileIndices.y;
        }
        else
 #endif
        {
-                    // Compute the full set of interactions in this tile.
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                y += (x < y ? -1 : 1);
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+            }

-#ifdef USE_EXCLUSIONS
-                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
-                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+            // Skip over tiles that have exclusions, since they were already processed.
+
+            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                    ushort2 tile = exclusionTiles[skipBase+tgx];
+                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                }
+                else
+                    skipTiles[threadIdx.x] = end;
+                skipBase += TILE_SIZE;            
+                currentSkipIndex = tbx;
+            }
+            while (skipTiles[currentSkipIndex] < pos)
+                currentSkipIndex++;
+            includeTile = (skipTiles[currentSkipIndex] != pos);
+        }
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+
+            // Load atom data for this tile.
+            
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
 #endif
+            atomIndices[threadIdx.x] = j;
+            if (j < PADDED_NUM_ATOMS) {
+                localData[localAtomIndex].posq = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                localData[localAtomIndex].value = 0;
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+
+                real4 blockCenterX = blockCenter[x];
+                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-#ifdef USE_EXCLUSIONS
-                        bool isExcluded = !(excl & 0x1);
+                    int atom2 = tbx+tj;
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    if (r2 < CUTOFF_SQUARED) {
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = atomIndices[tbx+tj];
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        localData[tbx+tj].value += tempValue2;
+                    }
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
 #endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+
+                unsigned int tj = tgx;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = localData[atom2].posq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
-                        atom2 = y*TILE_SIZE+tj;
+                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
-#ifdef USE_EXCLUSIONS
-                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-#endif
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
 #ifdef USE_CUTOFF
                    }
-#endif
-#ifdef USE_EXCLUSIONS
-                        excl >>= 1;
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
-            }
-        }
        
            // Write results.

-        if (pos < end) {
-            const unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
-        }
-        if (pos < end && x != y) {
-            const unsigned int offset = y*TILE_SIZE + tgx;
-            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
+            atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS)
+                atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
        }
-        lasty = y;
        pos++;
-    } while (pos < end);
+    }
 }
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {

        real3 crossProduct = cross(vec1, vec2);
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine);
    return angle;
 }


--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
        for (int atom = 0; atom < NUM_ATOMS; atom++) {
            real4 apos = posq[atom];
            real phase = apos.x*kx;
-            real2 structureFactor = make_real2(cos(phase), sin(phase));
+            real2 structureFactor = make_real2(COS(phase), SIN(phase));
            phase = apos.y*ky;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            phase = apos.z*kz;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            sum += apos.w*structureFactor;
        }
        cosSinSum[index] = sum;
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
            for (int ry = lowry; ry < KMAX_Y; ry++) {
                real ky = ry*reciprocalBoxSize.y;
                real phase = apos.x*kx;
-                real2 tab_xy = make_real2(cos(phase), sin(phase));
+                real2 tab_xy = make_real2(COS(phase), SIN(phase));
                phase = apos.y*ky;
-                tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                for (int rz = lowrz; rz < KMAX_Z; rz++) {
                    real kz = rz*reciprocalBoxSize.z;

@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
                    real k2 = kx*kx + ky*ky + kz*kz;
                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
-                    real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                    real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                    real2 sum = cosSinSum[index];
                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
                    force.x += dEdR*kx;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x1 = sqrt(-2.0f * log(x1));
+        x1 = SQRT(-2.0f * LOG(x1));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.x = x1 * cos(2.0f * 3.14159265f * x2);
+        value.x = x1 * COS(2.0f * 3.14159265f * x2);

        // Generate second value.

@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x3 = sqrt(-2.0f * log(x3));
+        x3 = SQRT(-2.0f * LOG(x3));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+        value.y = x3 * COS(2.0f * 3.14159265f * x4);

        // Generate third value.

@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x5 = sqrt(-2.0f * log(x5));
+        x5 = SQRT(-2.0f * LOG(x5));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+        value.z = x5 * COS(2.0f * 3.14159265f * x6);

        // Generate fourth value.

@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x7 = sqrt(-2.0f * log(x7));
+        x7 = SQRT(-2.0f * LOG(x7));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+        value.w = x7 * COS(2.0f * 3.14159265f * x8);

        // Record the values.

@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;

-        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
        mixed trns11 = xaksXd / axlng;
        mixed trns21 = yaksXd / axlng;
        mixed trns31 = zaksXd / axlng;
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        //                                        --- Step2  A2' ---

        float rc = 0.5f*params.y;
-        mixed rb = sqrt(params.x*params.x-rc*rc);
+        mixed rb = SQRT(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d/ra;
-        mixed cosphi = sqrt(1-sinphi*sinphi);
+        mixed cosphi = SQRT(1-sinphi*sinphi);
        mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
-        mixed cospsi = sqrt(1-sinpsi*sinpsi);
+        mixed cospsi = SQRT(1-sinpsi*sinpsi);

        mixed ya2d =   ra*cosphi;
        mixed xb2d = - rc*cospsi;
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
        mixed xb2d2 = xb2d*xb2d;
        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5f;

        //                                        --- Step3  al,be,ga ---
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;

        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
+        mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;

        //                                        --- Step4  A3' ---

-        mixed costheta = sqrt(1-sintheta*sintheta);
+        mixed costheta = SQRT(1-sintheta*sintheta);
        mixed xa3d = - ya2d*sintheta;
        mixed ya3d =   ya2d*costheta;
        mixed za3d = za1d;
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
        mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
        mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
        mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
-        eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
+        eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
+        eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
+        eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
 /**
 * Compute the direction each CCMA constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) {
+extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
+        const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the direction for this constraint.

@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        converged[0] = 1;
+        converged[1] = 0;
+    }
 }

 /**
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
    __syncthreads();
    mixed lowerTol = 1-2*tol+tol*tol;
    mixed upperTol = 1+2*tol+tol*tol;
+    bool threadConverged = true;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.

@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
        mixed dist2 = dir.w*dir.w;
        mixed diff = dist2 - rp2;
        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
-
-        // See whether it has converged.
-
-        if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
+        threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
+    }
+    if (groupConverged && !threadConverged)
        groupConverged = 0;
+    __syncthreads();
+    if (threadIdx.x == 0 && !groupConverged)
        converged[iteration%2] = 0;
-        }
-    }
 }

 /**