Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -61,7 +61,7 @@ using namespace OpenMM;
 using namespace std;

 const int CudaContext::ThreadBlockSize = 64;
-const int CudaContext::TileSize = 32;
+const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
+    src << "typedef unsigned int tileflags;\n";
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -42,6 +42,8 @@
 #include "windowsExportCuda.h"
 #include "CudaPlatform.h"

+typedef unsigned int tileflags;
+
 namespace OpenMM {

 class CudaArray;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
-        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
        vector<int2> atomsVec(ccmaAtoms->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete ccmaDelta1;
    if (ccmaDelta2 != NULL)
        delete ccmaDelta2;
-    if (ccmaConvergedMemory != NULL)
-        cuMemFreeHost(ccmaConvergedMemory);
+    if (ccmaConverged != NULL)
+        delete ccmaConverged;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
    }
    if (ccmaAtoms != NULL) {
-        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
+        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
        int i;
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory,
+                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
                tolPointer, &i};
        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i};
+                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConvergedDeviceMemory, &i};
+                &ccmaConverged->getDevicePointer(), &i};
        const int checkInterval = 4;
+        int* converged = (int*) context.getPinnedBuffer();
        for (i = 0; i < 150; i++) {
-            if (i == 0) {
-                ccmaConvergedMemory[0] = 1;
-                ccmaConvergedMemory[1] = 0;
-            }
            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0)
+            if ((i+1)%checkInterval == 0) {
+                ccmaConverged->download(converged, false);
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+            }
            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
-                if (ccmaConvergedMemory[i%2])
+                if (converged[i%2])
                    break;
            }
        }

--- a/platforms/cuda/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda/src/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -140,8 +140,7 @@ private:
    CudaArray* ccmaConstraintMatrixValue;
    CudaArray* ccmaDelta1;
    CudaArray* ccmaDelta2;
-    int* ccmaConvergedMemory;
-    CUdeviceptr ccmaConvergedDeviceMemory;
+    CudaArray* ccmaConverged;
    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -607,8 +606,6 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
-    CudaArray* pmeBsplineTheta;
-    CudaArray* pmeBsplineDTheta;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
@@ -617,9 +614,6 @@ private:
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
-    CUfunction pmeAtomRangeKernel;
-    CUfunction pmeZIndexKernel;
-    CUfunction pmeUpdateBsplinesKernel;
    CUfunction pmeSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
@@ -776,6 +770,8 @@ private:
    System& system;
    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
+    std::string pairValueSrc, pairEnergySrc;
+    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
 };

 /**

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
--- a/platforms/cuda/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda/src/CudaNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,6 +35,8 @@
 #include <vector>

 namespace OpenMM {
+    
+class CudaSort;

 /**
 * This class provides a generic interface for calculating nonbonded interactions.  It does this in two
@@ -181,10 +183,10 @@ public:
        return *interactingTiles;
    }
    /**
-     * Get the array containing flags for tiles with interactions.
+     * Get the array containing the atoms in each tile with interactions.
     */
-    CudaArray& getInteractionFlags() {
-        return *interactionFlags;
+    CudaArray& getInteractingAtoms() {
+        return *interactingAtoms;
    }
    /**
     * Get the array containing exclusion flags.
@@ -192,6 +194,12 @@ public:
    CudaArray& getExclusions() {
        return *exclusions;
    }
+    /**
+     * Get the array containing tiles with exclusions.
+     */
+    CudaArray& getExclusionTiles() {
+        return *exclusionTiles;
+    }
    /**
     * Get the array containing the index into the exclusion array for each tile.
     */
@@ -217,9 +225,17 @@ public:
        return numTiles;
    }
    /**
-     * Set the range of tiles that should be processed by this context.
+     * Set whether to add padding to the cutoff distance when building the neighbor list.
+     * This increases the size of the neighbor list (and thus the cost of computing interactions),
+     * but also means we don't need to rebuild it every time step.  The default value is true,
+     * since usually this improves performance.  For very expensive interactions, however,
+     * it may be better to set this to false.
+     */
+    void setUsePadding(bool padding);
+    /**
+     * Set the range of atom blocks and tiles that should be processed by this context, given as start and end fractions in the range 0 to 1.
     */
-    void setTileRange(int startTileIndex, int numTiles);
+    void setAtomBlockRange(double startFraction, double endFraction);
    /**
     * Create a Kernel for evaluating a nonbonded interaction.  Cutoffs and periodic boundary conditions
     * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
@@ -232,42 +248,38 @@ public:
     * @param isSymmetric   specifies whether the interaction is symmetric
     */
    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-    /**
-     * This is a utility routine for locating data in the exclusions array.  It takes the (x,y) indices of a tile,
-     * and returns the location in the array where the data for that tile begins.
-     * 
-     * This routine requires that x >= y.  If not, it will throw an exception.
-     * 
-     * @param x                   the x index of the tile
-     * @param y                   the y index of the tile
-     * @param exclusionIndices    the content of the exclusionIndices array
-     * @param exclusionRowIndices the content of the exclusionRowIndices array
-     * @return the index in the exclusions array at which the data for that tile begins
-     */
-    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
 private:
+    class BlockSortTrait;
    CudaContext& context;
    CUfunction forceKernel;
    CUfunction findBlockBoundsKernel;
+    CUfunction sortBoxDataKernel;
    CUfunction findInteractingBlocksKernel;
    CUfunction findInteractionsWithinBlocksKernel;
+    CudaArray* exclusionTiles;
    CudaArray* exclusions;
    CudaArray* exclusionIndices;
    CudaArray* exclusionRowIndices;
    CudaArray* interactingTiles;
-    CudaArray* interactionFlags;
+    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
-    std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
+    CudaArray* sortedBlocks;
+    CudaArray* sortedBlockCenter;
+    CudaArray* sortedBlockBoundingBox;
+    CudaArray* oldPositions;
+    CudaArray* rebuildNeighborList;
+    CudaSort* blockSorter;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
    std::string kernelSource;
    std::map<std::string, std::string> kernelDefines;
    double cutoff;
-    bool useCutoff, usePeriodic, anyExclusions;
-    int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
 };

 /**

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -118,7 +118,7 @@ private:
 };

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    sumKernel = cu.getKernel(module, "sumForces");
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system);
+    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
+        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
        cu.executeKernel(sumKernel, args, bufferSize);
        
-        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        
        int firstIndex = 0, lastIndex = 0;
-        int totalTiles = 0;
        for (int i = 0; i < (int) completionTimes.size(); i++) {
            if (completionTimes[i] < completionTimes[firstIndex])
                firstIndex = i;
            if (completionTimes[i] > completionTimes[lastIndex])
                lastIndex = i;
-            contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
-            totalTiles += contextTiles[i];
        }
-        int tilesToTransfer = totalTiles/1000;
-        if (tilesToTransfer < 1)
-            tilesToTransfer = 1;
-        if (tilesToTransfer > contextTiles[lastIndex])
-            tilesToTransfer = contextTiles[lastIndex];
-        contextTiles[firstIndex] += tilesToTransfer;
-        contextTiles[lastIndex] -= tilesToTransfer;
-        int startIndex = 0;
-        for (int i = 0; i < (int) contextTiles.size(); i++) {
-            data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
-            startIndex += contextTiles[i];
+        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
+        contextNonbondedFractions[firstIndex] += fractionToTransfer;
+        contextNonbondedFractions[lastIndex] -= fractionToTransfer;
+        double startFraction = 0.0;
+        for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
+            double endFraction = startFraction+contextNonbondedFractions[i];
+            if (i == contextNonbondedFractions.size()-1)
+                endFraction = 1.0; // Avoid roundoff error
+            data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
+            startFraction = endFraction;
        }
    }
    return energy;

--- a/platforms/cuda/src/CudaParallelKernels.h
+++ b/platforms/cuda/src/CudaParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ private:
    CudaPlatform::PlatformData& data;
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
-    std::vector<int> contextTiles;
+    std::vector<double> contextNonbondedFractions;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaSort.cpp
+++ b/platforms/cuda/src/CudaSort.cpp
@@ -32,7 +32,7 @@ using namespace OpenMM;
 using namespace std;

 CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
-        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    map<string, string> replacements;
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
+    shortListKernel = context.getKernel(module, "sortShortList");
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)

    int maxBlockSize;
    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
+    int maxSharedMem;
+    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
+    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    isShortList = (length <= maxLocalBuffer);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
-    sortKernelSize = rangeKernelSize/2;
+    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
-    int maxSharedMem;
-    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
-    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)

    // Create workspace arrays.

-    dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
-    bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
-    bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
-    offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
-    buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
+    if (!isShortList) {
+        dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
+        bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
+        bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
+        offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
+        buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
+    }
 }

 CudaSort::~CudaSort() {
@@ -95,38 +99,44 @@ CudaSort::~CudaSort() {
 }

 void CudaSort::sort(CudaArray& data) {
-    if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
+    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;
+    if (isShortList) {
+        // We can use a simpler sort kernel that does the entire operation at once in shared memory.
+        
+        void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
+        context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
+    }
+    else {
+        // Compute the range of data values.

-    // Compute the range of data values.
-
-    unsigned int dataSize = data.getSize();
-    void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
-    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
+        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

-    // Assign array elements to buckets.
+        // Assign array elements to buckets.

-    unsigned int numBuckets = bucketOffset->getSize();
-    context.clearBuffer(*bucketOffset);
-    void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
-            &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
-    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
+        unsigned int numBuckets = bucketOffset->getSize();
+        context.clearBuffer(*bucketOffset);
+        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
+                &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+        context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

-    // Compute the position of each bucket.
+        // Compute the position of each bucket.

-    void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
-    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
+        void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
+        context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

-    // Copy the data into the buckets.
+        // Copy the data into the buckets.

-    void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
-            &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
-    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
+        void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
+                &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+        context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

-    // Sort each bucket.
+        // Sort each bucket.

-    void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
-    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+        void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
+        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+    }
 }
--- a/platforms/cuda/src/CudaSort.h
+++ b/platforms/cuda/src/CudaSort.h
@@ -92,8 +92,9 @@ private:
    CudaArray* offsetInBucket;
    CudaArray* bucketOffset;
    CudaArray* buckets;
-    CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+    CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList;
 };

 /**

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
 #if USE_EWALD
-bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
 if (!isExcluded || needCorrection) {
-    real tempForce = 0.0f;
    if (r2 < CUTOFF_SQUARED || needCorrection) {
        const real alphaR = EWALD_ALPHA*r;
        const real expAlphaRSqr = EXP(-alphaR*alphaR);
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
        t *= t;
        t *= t;
        const real erfcAlphaR = RECIP(t*t);
+        real tempForce = 0.0f;
        if (needCorrection) {
            // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
            tempEnergy += prefactor*erfcAlphaR;
 #endif
        }
+        dEdR += tempForce*invR*invR;
    }
-    dEdR += tempForce*invR*invR;
 }
 #else
 {

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {

        real3 crossProduct = cross(vec1, vec2);
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine);
    return angle;
 }


--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
        for (int atom = 0; atom < NUM_ATOMS; atom++) {
            real4 apos = posq[atom];
            real phase = apos.x*kx;
-            real2 structureFactor = make_real2(cos(phase), sin(phase));
+            real2 structureFactor = make_real2(COS(phase), SIN(phase));
            phase = apos.y*ky;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            phase = apos.z*kz;
-            structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
            sum += apos.w*structureFactor;
        }
        cosSinSum[index] = sum;
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
            for (int ry = lowry; ry < KMAX_Y; ry++) {
                real ky = ry*reciprocalBoxSize.y;
                real phase = apos.x*kx;
-                real2 tab_xy = make_real2(cos(phase), sin(phase));
+                real2 tab_xy = make_real2(COS(phase), SIN(phase));
                phase = apos.y*ky;
-                tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                for (int rz = lowrz; rz < KMAX_Z; rz++) {
                    real kz = rz*reciprocalBoxSize.z;

@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
                    real k2 = kx*kx + ky*ky + kz*kz;
                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
-                    real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
+                    real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
                    real2 sum = cosSinSum[index];
                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
                    force.x += dEdR*kx;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x1 = sqrt(-2.0f * log(x1));
+        x1 = SQRT(-2.0f * LOG(x1));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.x = x1 * cos(2.0f * 3.14159265f * x2);
+        value.x = x1 * COS(2.0f * 3.14159265f * x2);

        // Generate second value.

@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x3 = sqrt(-2.0f * log(x3));
+        x3 = SQRT(-2.0f * LOG(x3));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+        value.y = x3 * COS(2.0f * 3.14159265f * x4);

        // Generate third value.

@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x5 = sqrt(-2.0f * log(x5));
+        x5 = SQRT(-2.0f * LOG(x5));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+        value.z = x5 * COS(2.0f * 3.14159265f * x6);

        // Generate fourth value.

@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        state.y ^= state.y << 13;
        state.y ^= state.y >> 17;
        state.y ^= state.y << 5;
-        x7 = sqrt(-2.0f * log(x7));
+        x7 = SQRT(-2.0f * LOG(x7));
        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
        m = state.w + state.w + state.z + carry;
        state.z = state.w;
        state.w = m;
        carry = k >> 30;
        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+        value.w = x7 * COS(2.0f * 3.14159265f * x8);

        // Record the values.

@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;

-        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
        mixed trns11 = xaksXd / axlng;
        mixed trns21 = yaksXd / axlng;
        mixed trns31 = zaksXd / axlng;
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        //                                        --- Step2  A2' ---

        float rc = 0.5f*params.y;
-        mixed rb = sqrt(params.x*params.x-rc*rc);
+        mixed rb = SQRT(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d/ra;
-        mixed cosphi = sqrt(1-sinphi*sinphi);
+        mixed cosphi = SQRT(1-sinphi*sinphi);
        mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
-        mixed cospsi = sqrt(1-sinpsi*sinpsi);
+        mixed cospsi = SQRT(1-sinpsi*sinpsi);

        mixed ya2d =   ra*cosphi;
        mixed xb2d = - rc*cospsi;
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
        mixed xb2d2 = xb2d*xb2d;
        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5f;

        //                                        --- Step3  al,be,ga ---
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;

        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
+        mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;

        //                                        --- Step4  A3' ---

-        mixed costheta = sqrt(1-sintheta*sintheta);
+        mixed costheta = SQRT(1-sintheta*sintheta);
        mixed xa3d = - ya2d*sintheta;
        mixed ya3d =   ya2d*costheta;
        mixed za3d = za1d;
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
        mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
        mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
        mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
-        eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
+        eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
+        eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
+        eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
 /**
 * Compute the direction each CCMA constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) {
+extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
+        const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the direction for this constraint.

@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        converged[0] = 1;
+        converged[1] = 0;
+    }
 }

 /**
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
    __syncthreads();
    mixed lowerTol = 1-2*tol+tol*tol;
    mixed upperTol = 1+2*tol+tol*tol;
+    bool threadConverged = true;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.

@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
        mixed dist2 = dir.w*dir.w;
        mixed diff = dist2 - rp2;
        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
-
-        // See whether it has converged.
-
-        if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
-            groupConverged = 0;
-            converged[iteration%2] = 0;
-        }
+        threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
    }
+    if (groupConverged && !threadConverged)
+        groupConverged = 0;
+    __syncthreads();
+    if (threadIdx.x == 0 && !groupConverged)
+        converged[iteration%2] = 0;
 }

 /**