Merge pull request #1499 from peastman/reallocneighbors

Improved method for resizing neighbor list

Merge pull request #1499 from peastman/reallocneighbors
Improved method for resizing neighbor list
6c6d68d2 · peastman · c2be00a6 · 6cc2edd1 · 6c6d68d2 · 6c6d68d2
Commit 6c6d68d2 authored May 26, 2016 by peastman
20 changed files
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -326,6 +326,18 @@ public:
    void setStepsSinceReorder(int steps) {
        stepsSinceReorder = steps;
    }
+    /**
+     * Get the flag that marks whether the current force evaluation is valid.
+     */
+    bool getForcesValid() const {
+        return forcesValid;
+    }
+    /**
+     * Get the flag that marks whether the current force evaluation is valid.
+     */
+    void setForcesValid(bool valid) {
+        forcesValid = valid;
+    }
    /**
     * Get the number of atoms.
     */
@@ -572,7 +584,7 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel;
+    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel, forcesValid;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -269,6 +269,8 @@ private:
    CudaArray* oldPositions;
    CudaArray* rebuildNeighborList;
    CudaSort* blockSorter;
+    CUevent downloadCountEvent;
+    int* pinnedCountBuffer;
    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -1131,7 +1131,6 @@ void CudaContext::reorderAtoms() {
        reorderAtomsImpl<float, float4, double, double4>();
    else
        reorderAtomsImpl<float, float4, float, float4>();
-    nonbonded->updateNeighborListSize();
 }

 template <class Real, class Real4, class Mixed, class Mixed4>

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -94,6 +94,7 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
 }

 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    cu.setForcesValid(true);
    cu.setAsCurrent();
    cu.clearAutoclearBuffers();
    for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
@@ -125,6 +126,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
                sum += energy[i];
        }
    }
+    if (!cu.getForcesValid())
+        valid = false;
    return sum;
 }


--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -65,12 +65,14 @@ private:
 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
+    CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
+    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
@@ -106,6 +108,9 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete rebuildNeighborList;
    if (blockSorter != NULL)
        delete blockSorter;
+    if (pinnedCountBuffer != NULL)
+        cuMemFreeHost(pinnedCountBuffer);
+    cuEventDestroy(downloadCountEvent);
 }

 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -375,17 +380,14 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    bool rebuild = false;
-    do {
    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
    blockSorter->sort(*sortedBlocks);
    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
    forceRebuildNeighborList = false;
-        if (context.getComputeForceCount() == 1)
-            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
-    } while(rebuild);
    lastCutoff = kernels.cutoffDistance;
+    interactionCount->download(pinnedCountBuffer, false);
+    cuEventRecord(downloadCountEvent, context.getCurrentStream());
 }

 void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
@@ -398,20 +400,22 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
        context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
+    if (useCutoff && numTiles > 0) {
+        cuEventSynchronize(downloadCountEvent);
+        updateNeighborListSize();
+    }
 }

 bool CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
-    interactionCount->download(pinnedInteractionCount);
-    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
+    if (pinnedCountBuffer[0] <= (unsigned int) maxTiles)
        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    maxTiles = (int) (1.2*pinnedCountBuffer[0]);
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;
@@ -428,6 +432,7 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
        forceArgs[17] = &interactingAtoms->getDevicePointer();
    findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
    return true;
 }


--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -168,6 +168,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -191,16 +193,12 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -223,7 +221,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
@@ -146,6 +146,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -167,16 +169,12 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -199,7 +197,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -204,6 +204,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -225,16 +227,12 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -257,7 +255,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

@@ -559,6 +557,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -580,16 +580,12 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -612,7 +608,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -117,7 +117,9 @@ extern "C" __global__ void computeNonbonded(
 #ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
 #endif
+
    // First loop: process tiles that contain exclusions.
+
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -309,8 +311,11 @@ extern "C" __global__ void computeNonbonded(

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
+
 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
 #else
@@ -335,16 +340,12 @@ extern "C" __global__ void computeNonbonded(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -367,7 +368,7 @@ extern "C" __global__ void computeNonbonded(
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
            // Load atom data for this tile.

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -408,6 +408,18 @@ public:
    void setStepsSinceReorder(int steps) {
        stepsSinceReorder = steps;
    }
+    /**
+     * Get the flag that marks whether the current force evaluation is valid.
+     */
+    bool getForcesValid() const {
+        return forcesValid;
+    }
+    /**
+     * Get the flag that marks whether the current force evaluation is valid.
+     */
+    void setForcesValid(bool valid) {
+        forcesValid = valid;
+    }
    /**
     * Get the number of atoms.
     */
@@ -684,7 +696,7 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
-    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic;
+    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic, forcesValid;
    mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ;
    mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble;
    std::string defaultOptimizationOptions;

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -281,6 +281,9 @@ private:
    OpenCLArray* oldPositions;
    OpenCLArray* rebuildNeighborList;
    OpenCLSort* blockSorter;
+    cl::Event downloadCountEvent;
+    cl::Buffer* pinnedCountBuffer;
+    int* pinnedCountMemory;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -1052,7 +1052,6 @@ void OpenCLContext::reorderAtoms() {
        reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>();
    else
        reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>();
-    nonbonded->updateNeighborListSize();
 }

 template <class Real, class Real4, class Mixed, class Mixed4>

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -118,6 +118,7 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
 }

 void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    cl.setForcesValid(true);
    cl.clearAutoclearBuffers();
    for (vector<OpenCLContext::ForcePreComputation*>::iterator iter = cl.getPreComputations().begin(); iter != cl.getPreComputations().end(); ++iter)
        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
@@ -149,6 +150,8 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
                sum += energy[i];
        }
    }
+    if (!cl.getForcesValid())
+        valid = false;
    return sum;
 }


--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -57,7 +57,7 @@ private:
 OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
@@ -90,6 +90,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
+    pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(int));
+    pinnedCountMemory = (int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
 }

 OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
@@ -123,6 +125,8 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete rebuildNeighborList;
    if (blockSorter != NULL)
        delete blockSorter;
+    if (pinnedCountBuffer != NULL)
+        delete pinnedCountBuffer;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -357,8 +361,6 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    bool rebuild = false;
-    do {
    setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
    context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
    blockSorter->sort(*sortedBlocks);
@@ -367,10 +369,8 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
    setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
    context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
    forceRebuildNeighborList = false;
-        if (context.getComputeForceCount() == 1)
-            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
-    } while (rebuild);
    lastCutoff = kernels.cutoffDistance;
+    context.getQueue().enqueueReadBuffer(interactionCount->getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent); 
 }

 void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
@@ -385,20 +385,22 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
            setPeriodicBoxArgs(context, kernel, 9);
        context.executeKernel(kernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
+    if (useCutoff && numTiles > 0) {
+        downloadCountEvent.wait();
+        updateNeighborListSize();
+    }
 }

 bool OpenCLNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
-    interactionCount->download(pinnedInteractionCount);
-    if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
+    if (pinnedCountMemory[0] <= (unsigned int) interactingTiles->getSize())
        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    int maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    int maxTiles = (int) (1.2*pinnedCountMemory[0]);
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;
@@ -430,6 +432,7 @@ bool OpenCLNonbondedUtilities::updateNeighborListSize() {
        kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
    }
    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
    return true;
 }


--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
@@ -181,6 +181,8 @@ __kernel void computeN2Energy(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -204,16 +206,12 @@ __kernel void computeN2Energy(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -239,7 +237,7 @@ __kernel void computeN2Energy(
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -201,6 +201,8 @@ __kernel void computeN2Energy(

 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -220,16 +222,12 @@ __kernel void computeN2Energy(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -248,7 +246,7 @@ __kernel void computeN2Energy(
                nextToSkip = end;
        }
        includeTile = (nextToSkip != pos);
-        }
+#endif
        if (includeTile) {
            // Load the data for this tile.


--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
@@ -157,6 +157,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -178,16 +180,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -213,7 +211,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -170,6 +170,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -188,16 +190,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -216,7 +214,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
                nextToSkip = end;
        }
        includeTile = (nextToSkip != pos);
-        }
+#endif
        if (includeTile) {
            // Load the data for this tile.


--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
@@ -169,6 +169,8 @@ __kernel void computeBornSum(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -190,16 +192,12 @@ __kernel void computeBornSum(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -225,7 +223,7 @@ __kernel void computeBornSum(
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

@@ -556,6 +554,8 @@ __kernel void computeGBSAForce1(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -577,16 +577,12 @@ __kernel void computeGBSAForce1(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -612,7 +608,7 @@ __kernel void computeGBSAForce1(
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
-        }
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -178,6 +178,8 @@ __kernel void computeBornSum(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -196,16 +198,12 @@ __kernel void computeBornSum(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -224,7 +222,7 @@ __kernel void computeBornSum(
                nextToSkip = end;
        }
        includeTile = (nextToSkip != pos);
-        }
+#endif
        if (includeTile) {
            // Load the data for this tile.

@@ -593,6 +591,8 @@ __kernel void computeGBSAForce1(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -611,16 +611,12 @@ __kernel void computeGBSAForce1(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
+#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
@@ -639,7 +635,7 @@ __kernel void computeGBSAForce1(
                nextToSkip = end;
        }
        includeTile = (nextToSkip != pos);
-        }
+#endif
        if (includeTile) {
            // Load the data for this tile.