Merge pull request #1632 from peastman/pairs

Fine grained pair list of nonbonded interactions

Merge pull request #1632 from peastman/pairs
Fine grained pair list of nonbonded interactions
388bf972 · peastman · GitHub · 064a781f · 7c911459 · 388bf972
Commit 388bf972 authored Oct 14, 2016 by peastman Committed by GitHub Oct 14, 2016
7 changed files
--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -78,8 +78,9 @@ public:
     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
     * @param kernel           the code to evaluate the interaction
     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
@@ -189,6 +190,12 @@ public:
    CudaArray& getInteractingAtoms() {
        return *interactingAtoms;
    }
+    /**
+     * Get the array containing single pairs in the neighbor list.
+     */
+    CudaArray& getSinglePairs() {
+        return *singlePairs;
+    }
    /**
     * Get the array containing exclusion flags.
     */
@@ -270,6 +277,8 @@ private:
    CudaArray* interactingTiles;
    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
+    CudaArray* singlePairs;
+    CudaArray* singlePairCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
    CudaArray* sortedBlocks;
@@ -288,8 +297,8 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
 };

 /**

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -83,7 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
-    int* tileCounts;
+    int2* interactionCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Add the interaction to the default nonbonded kernel.
   
    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
-    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
    if (hasLJ)
        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));

@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    if (force.getNumInteractionGroups() > 0)
        initInteractionGroups(force, source, tableTypes);
    else {
-        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
            cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -64,15 +64,16 @@ private:

 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
+        canUsePairList(true) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
-    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
+    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete interactingAtoms;
    if (interactionCount != NULL)
        delete interactionCount;
+    if (singlePairs != NULL)
+        delete singlePairs;
    if (blockCenter != NULL)
        delete blockCenter;
    if (blockBoundingBox != NULL)
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
    cuEventDestroy(downloadCountEvent);
 }

-void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
+void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
    usePeriodic = usesPeriodic;
    groupCutoff[forceGroup] = cutoffDistance;
    groupFlags |= 1<<forceGroup;
+    canUsePairList &= supportsPairList;
    if (kernel.size() > 0) {
        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
            groupKernelSource[forceGroup] = "";
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            maxTiles = numTiles;
        if (maxTiles < 1)
            maxTiles = 1;
+        maxSinglePairs = 5*numAtoms;
        interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
-        interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
+        interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount");
+        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
        blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
        blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
        rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
-        vector<unsigned int> count(1, 0);
+        vector<unsigned int> count(2, 0);
        interactionCount->upload(count);
    }

@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        forceArgs.push_back(&blockCenter->getDevicePointer());
        forceArgs.push_back(&blockBoundingBox->getDevicePointer());
        forceArgs.push_back(&interactingAtoms->getDevicePointer());
+        forceArgs.push_back(&maxSinglePairs);
+        forceArgs.push_back(&singlePairs->getDevicePointer());
    }
    for (int i = 0; i < (int) parameters.size(); i++)
        forceArgs.push_back(&parameters[i].getMemory());
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
        findInteractingBlocksArgs.push_back(&maxTiles);
+        findInteractingBlocksArgs.push_back(&maxSinglePairs);
        findInteractingBlocksArgs.push_back(&startBlockIndex);
        findInteractingBlocksArgs.push_back(&numBlocks);
        findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
@@ -424,12 +434,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
 bool CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    if (pinnedCountBuffer[0] <= (unsigned int) maxTiles)
+    if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

+    if (pinnedCountBuffer[0] > maxTiles) {
        maxTiles = (int) (1.2*pinnedCountBuffer[0]);
        int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
        if (maxTiles > totalTiles)
@@ -446,6 +457,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
        if (forceArgs.size() > 0)
            forceArgs[17] = &interactingAtoms->getDevicePointer();
        findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
+    }
+    if (pinnedCountBuffer[1] > maxSinglePairs) {
+        maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
+        delete singlePairs;
+        singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
+        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
+        if (forceArgs.size() > 0)
+            forceArgs[19] = &singlePairs->getDevicePointer();
+        findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer();
+    }
    forceRebuildNeighborList = true;
    context.setForcesValid(false);
    return true;
@@ -493,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
 class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
 public:
    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel),
-            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) {
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.
@@ -77,7 +77,7 @@ public:
        }
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
        if (cu.getNonbondedUtilities().getUsePeriodic())
-            cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false);
+            cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
    }
 private:
    ContextImpl& context;
@@ -87,15 +87,15 @@ private:
    int groups;
    void* pinnedMemory;
    CUevent event;
-    int& numTiles;
+    int2& interactionCount;
 };

 class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
 public:
    FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int2& interactionCount) :
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
-            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
    }
    void execute() {
        // Execute the kernel, then download forces.
@@ -120,7 +120,8 @@ public:
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
        }
-        if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
+        if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
+                interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
            valid = false;
            cu.getNonbondedUtilities().updateNeighborListSize();
        }
@@ -136,12 +137,12 @@ private:
    long long* pinnedMemory;
    CudaArray& contextForces;
    bool& valid;
-    int& numTiles;
+    int2& interactionCount;
 };

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
-        tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+        interactionCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
 }
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
        cuMemFreeHost(pinnedForceBuffer);
    cuEventDestroy(event);
    cuStreamDestroy(peerCopyStream);
-    if (tileCounts != NULL)
-        cuMemFreeHost(tileCounts);
+    if (interactionCounts != NULL)
+        cuMemFreeHost(interactionCounts);
 }

 void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
        contextNonbondedFractions[i] = 1/(double) numContexts;
    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
    CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
-    CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
+    CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i]));
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
    }
 }

@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
+        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, interactionCounts[i]));
    }
    data.syncContexts();
    double energy = 0.0;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -71,9 +71,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
+        interactionCount[1] = 0;
    }
 }

+__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
+    // Record interactions that should be computed as single pairs rather than in blocks.
+    
+    const int indexInWarp = threadIdx.x%32;
+    int sum = 0;
+    for (int i = indexInWarp; i < length; i += 32) {
+        int count = __popc(flags[i]);
+        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
+    }
+    sumBuffer[indexInWarp] = sum;
+    for (int step = 1; step < 32; step *= 2) {
+        int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
+        sumBuffer[indexInWarp] += add;
+    }
+    int pairsToStore = sumBuffer[31];
+    if (indexInWarp == 0)
+        pairStartIndex = atomicAdd(singlePairCount, pairsToStore);
+    int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);
+    for (int i = indexInWarp; i < length; i += 32) {
+        int count = __popc(flags[i]);
+        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count < maxSinglePairs) {
+            int f = flags[i];
+            while (f != 0) {
+                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
+                f &= f-1;
+                pairIndex++;
+            }
+        }
+    }
+    
+    // Compact the remaining interactions.
+    
+    const int warpMask = (1<<indexInWarp)-1;
+    int numCompacted = 0;
+    for (int start = 0; start < length; start += 32) {
+        int i = start+indexInWarp;
+        int atom = atoms[i];
+        int flag = flags[i];
+        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
+        int includeFlags = __ballot(include);
+        if (include) {
+            int index = numCompacted+__popc(includeFlags&warpMask);
+            atoms[index] = atom;
+            flags[index] = flag;
+        }
+        numCompacted += __popc(includeFlags);
+    }
+    return numCompacted;
+}
+
 /**
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
@@ -124,8 +175,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
 *
 */
 extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq,
-        unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
+        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
+        unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
        const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {

@@ -138,12 +190,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
+    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
    __shared__ real3 posBuffer[GROUP_SIZE];
    __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
+    __shared__ int sumBuffer[GROUP_SIZE];
+    __shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
+    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
    volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
+    volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/32];

    // Loop over blocks.
    
@@ -227,7 +284,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                APPLY_PERIODIC_TO_DELTA(atomDelta)
 #endif
                int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
-                bool interacts = false;
+                int interacts = 0;
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
                    int first = __ffs(atomFlags)-1;
                    int last = 32-__clz(atomFlags);
@@ -236,14 +293,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                        for (int j = first; j < last; j++) {
                            real3 delta = pos2-posBuffer[warpStart+j];
                            APPLY_PERIODIC_TO_DELTA(delta)
-                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
                        }
                    }
                    else {
 #endif
                        for (int j = first; j < last; j++) {
                            real3 delta = pos2-posBuffer[warpStart+j];
-                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
                        }
 #ifdef USE_PERIODIC
                    }
@@ -253,15 +310,22 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                // Add any interacting atoms to the buffer.
                
                int includeAtomFlags = __ballot(interacts);
-                if (interacts)
-                    buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2;
+                if (interacts) {
+                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
+                    buffer[index] = atom2;
+                    flagsBuffer[index] = interacts;
+                }
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
+#if MAX_BITS_FOR_PAIRS > 0
+                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
                    int tilesToStore = neighborsInBuffer/TILE_SIZE;
+                    if (tilesToStore > 0) {
                        if (indexInWarp == 0)
-                        tileStartIndex = atomicAdd(interactionCount, tilesToStore);
+                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
                        int newTileStartIndex = tileStartIndex;
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
                            if (indexInWarp < tilesToStore)
@@ -274,13 +338,18 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                    }
                }
            }
+        }
        
        // If we have a partially filled buffer,  store it to memory.
        
+#if MAX_BITS_FOR_PAIRS > 0
+        if (neighborsInBuffer > 32)
+            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
        if (neighborsInBuffer > 0) {
            int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
            if (indexInWarp == 0)
-                tileStartIndex = atomicAdd(interactionCount, tilesToStore);
+                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
            int newTileStartIndex = tileStartIndex;
            if (newTileStartIndex+tilesToStore <= maxTiles) {
                if (indexInWarp < tilesToStore)

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
+        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
+        const int2* __restrict__ singlePairs
 #endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
        }
        pos++;
    }
+    
+    // Third loop: single pairs that aren't part of a tile.
+    
+#if USE_CUTOFF
+    const unsigned int numPairs = interactionCount[1];
+    if (numPairs > maxSinglePairs)
+        return; // There wasn't enough memory for the neighbor list.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
+        int2 pair = singlePairs[i];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        real4 posq1 = posq[atom1];
+        real4 posq2 = posq[atom2];
+        LOAD_ATOM1_PARAMETERS
+        int j = atom2;
+atom2 = threadIdx.x;
+        DECLARE_LOCAL_PARAMETERS
+        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+        LOAD_ATOM2_PARAMETERS
+atom2 = pair.y;
+        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+        real invR = RSQRT(r2);
+        real r = r2*invR;
+#ifdef USE_SYMMETRIC
+        real dEdR = 0.0f;
+#else
+        real3 dEdR1 = make_real3(0);
+        real3 dEdR2 = make_real3(0);
+#endif
+        bool hasExclusions = false;
+        bool isExcluded = false;
+        real tempEnergy = 0.0f;
+        const real interactionScale = 1.0f;
+        COMPUTE_INTERACTION
+        energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+        real3 dEdR1 = delta*dEdR;
+        real3 dEdR2 = -dEdR1;
+#endif
+        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
+        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
+#endif
+    }
+#endif
 #ifdef INCLUDE_ENERGY
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 #endif