Merge remote-tracking branch 'upstream/master'

eb232608 · John Chodera (MSKCC) · 62581e9c · 7f8c5089 · eb232608 · eb232608
Commit eb232608 authored Apr 22, 2015 by John Chodera (MSKCC)
20 changed files
--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
@@ -25,8 +25,6 @@
 #include <string.h>
 #include <sstream>

-#include "SimTKOpenMMCommon.h"
-#include "SimTKOpenMMLog.h"
 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceForce.h"
 #include "CpuCustomGBForce.h"

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
@@ -26,8 +26,6 @@
 #include <sstream>
 #include <utility>

-#include "SimTKOpenMMCommon.h"
-#include "SimTKOpenMMLog.h"
 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceForce.h"
 #include "CpuCustomManyParticleForce.h"

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -25,8 +25,6 @@
 #include <string.h>
 #include <sstream>

-#include "SimTKOpenMMCommon.h"
-#include "SimTKOpenMMLog.h"
 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceForce.h"
 #include "CpuCustomNonbondedForce.h"

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -231,13 +231,13 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
        throw OpenMMException("Particle coordinate is nan");
 }

-double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    // Sum the forces from all the threads.
    
    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
    data.threads.execute(task);
    data.threads.waitForThreads();
-    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups);
+    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups, valid);
 }

 CpuCalcPeriodicTorsionForceKernel::~CpuCalcPeriodicTorsionForceKernel() {
@@ -508,13 +508,12 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        if (nonbondedMethod == PME) {
            // If available, use the optimized PME implementation.

-            try {
+            vector<string> kernelNames;
+            kernelNames.push_back("CalcPmeReciprocalForce");
+            useOptimizedPme = getPlatform().supportsKernels(kernelNames);
+            if (useOptimizedPme) {
                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
-                useOptimizedPme = true;
-            }
-            catch (OpenMMException& ex) {
-                // The CPU PME plugin isn't available.
            }
        }
    }

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
@@ -23,8 +23,6 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include "SimTKOpenMMCommon.h"
-#include "SimTKOpenMMLog.h"
 #include "SimTKOpenMMUtilities.h"
 #include "CpuLangevinDynamics.h"


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -24,7 +24,6 @@

 #include <complex>

-#include "SimTKOpenMMCommon.h"
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
@@ -217,7 +216,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c

        // setup reciprocal box

-        float recipBoxSize[3] = { TWO_PI / periodicBoxVectors[0][0], TWO_PI / periodicBoxVectors[1][1], TWO_PI / periodicBoxVectors[2][2]};
+        float recipBoxSize[3] = {(float) (TWO_PI/periodicBoxVectors[0][0]), (float) (TWO_PI/periodicBoxVectors[1][1]), (float) (TWO_PI/periodicBoxVectors[2][2])};


        // setup K-vectors

--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -22,7 +22,6 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include "SimTKOpenMMCommon.h"
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"


--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -22,7 +22,6 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include "SimTKOpenMMCommon.h"
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec8.h"
 #include "openmm/OpenMMException.h"

--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -62,7 +62,7 @@ void testNeighborList(bool periodic, bool triclinic) {
        boxVectors[1] = RealVec(0, 15, 0);
        boxVectors[2] = RealVec(0, 0, 22);
    }
-    const float boxSize[3] = {boxVectors[0][0], boxVectors[1][1], boxVectors[2][2]};
+    const float boxSize[3] = {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2]};
    const int blockSize = 8;
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -92,11 +92,13 @@ public:
     * @param includeForce  true if forces should be computed
     * @param includeEnergy true if potential energy should be computed
     * @param groups        a set of bit flags for which force groups to include
+     * @param valid         the method may set this to false to indicate the results are invalid and the force/energy
+     *                      calculation should be repeated
     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
     */
-    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid);
 private:
   CudaContext& cu;
 };

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -278,7 +278,7 @@ private:
    std::string kernelSource;
    std::map<std::string, std::string> kernelDefines;
    double cutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
 };


--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -69,11 +69,13 @@ public:
     * @param includeForce  true if forces should be computed
     * @param includeEnergy true if potential energy should be computed
     * @param groups        a set of bit flags for which force groups to include
+     * @param valid         the method may set this to false to indicate the results are invalid and the force/energy
+     *                      calculation should be repeated
     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
     */
-    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid);
 private:
    class BeginComputationTask;
    class FinishComputationTask;
@@ -81,6 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
+    int* tileCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -77,12 +77,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false),
        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    this->compiler = "\""+compiler+"\"";
-    try {
-        compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context);
-        hasCompilerKernel = true;
-    }
-    catch (...) {
-        // The runtime compiler plugin isn't available.
+    if (platformData.context != NULL) {
+        try {
+            compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context);
+            hasCompilerKernel = true;
+        }
+        catch (...) {
+            // The runtime compiler plugin isn't available.
+        }
    }
    if (hostCompiler.size() > 0)
        this->compiler = compiler+" --compiler-bindir "+hostCompiler;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -104,7 +104,7 @@ void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool
        nb.prepareInteractions();
 }

-double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
    cu.getBondedUtilities().computeInteractions(groups);
    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
        cu.getNonbondedUtilities().computeInteractions();

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -62,10 +62,10 @@ private:
    bool useDouble;
 };

-CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
+CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0), forceRebuildNeighborList(true) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
@@ -264,14 +264,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
        sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
        oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
-        if (context.getUseDoublePrecision()) {
-            vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
-            oldPositions->upload(oldPositionsVec);
-        }
-        else {
-            vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
-            oldPositions->upload(oldPositionsVec);
-        }
        rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<unsigned int> count(1, 0);
@@ -322,6 +314,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
        sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
        sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
+        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
        findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
@@ -363,6 +356,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
    blockSorter->sort(*sortedBlocks);
    context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
+    forceRebuildNeighborList = false;
 }

 void CudaNonbondedUtilities::computeInteractions() {
@@ -400,14 +394,7 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
    if (forceArgs.size() > 0)
        forceArgs[17] = &interactingAtoms->getDevicePointer();
    findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
-    if (context.getUseDoublePrecision()) {
-        vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
-        oldPositions->upload(oldPositionsVec);
-    }
-    else {
-        vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
-        oldPositions->upload(oldPositionsVec);
-    }
+    forceRebuildNeighborList = true;
 }

 void CudaNonbondedUtilities::setUsePadding(bool padding) {
@@ -419,8 +406,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
    startBlockIndex = (int) (startFraction*numAtomBlocks);
    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
-    startTileIndex = (int) (startFraction*totalTiles);;
+    startTileIndex = (int) (startFraction*totalTiles);
    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
+    forceRebuildNeighborList = true;
 }

 CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
 class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
 public:
    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event) : context(context), cu(cu), kernel(kernel),
-            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) {
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.
@@ -76,6 +76,8 @@ public:
                cu.getPosq().upload(pinnedMemory, false);
        }
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
+        if (cu.getNonbondedUtilities().getUsePeriodic())
+            cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false);
    }
 private:
    ContextImpl& context;
@@ -85,19 +87,20 @@ private:
    int groups;
    void* pinnedMemory;
    CUevent event;
+    int& numTiles;
 };

 class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
 public:
    FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces) :
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
-            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces) {
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
    }
    void execute() {
        // Execute the kernel, then download forces.
        
-        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
+        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
        if (cu.getComputeForceCount() < 200) {
            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.

@@ -117,6 +120,10 @@ public:
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
        }
+        if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
+            valid = false;
+            cu.getNonbondedUtilities().updateNeighborListSize();
+        }
    }
 private:
    ContextImpl& context;
@@ -128,11 +135,13 @@ private:
    long long& completionTime;
    long long* pinnedMemory;
    CudaArray& contextForces;
+    bool& valid;
+    int& numTiles;
 };

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
-        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
+        tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
 }
@@ -147,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
        cuMemFreeHost(pinnedForceBuffer);
    cuEventDestroy(event);
    cuStreamDestroy(peerCopyStream);
+    if (tileCounts != NULL)
+        cuMemFreeHost(tileCounts);
 }

 void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -154,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    cu.setAsCurrent();
    CUmodule module = cu.createModule(CudaKernelSources::parallel);
    sumKernel = cu.getKernel(module, "sumForces");
-    for (int i = 0; i < (int) kernels.size(); i++)
+    int numContexts = data.contexts.size();
+    for (int i = 0; i < numContexts; i++)
        getKernel(i).initialize(system);
-    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
-        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
+    for (int i = 0; i < numContexts; i++)
+        contextNonbondedFractions[i] = 1/(double) numContexts;
    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
    CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
+    CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -189,22 +202,21 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event));
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i]));
    }
 }

-#include <cstdio>
-double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces));
+        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
    }
    data.syncContexts();
    double energy = 0.0;
    for (int i = 0; i < (int) data.contextEnergy.size(); i++)
        energy += data.contextEnergy[i];
-    if (includeForce) {
+    if (includeForce && valid) {
        // Sum the forces from all devices.
        
        CudaContext& cu = *data.contexts[0];

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -43,7 +43,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
 extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
        real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
-        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) {
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        int index = (int) sortedBlock[i].y;
        sortedBlockCenter[i] = blockCenter[index];
@@ -52,7 +52,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
    
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

-    bool rebuild = false;
+    bool rebuild = forceRebuild;
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)

--- a/platforms/cuda/tests/TestCudaLocalEnergyMinimizer.cpp
+++ b/platforms/cuda/tests/TestCudaLocalEnergyMinimizer.cpp
@@ -77,10 +77,10 @@ void testHarmonicBonds() {
 }

 void testLargeSystem() {
-    const int numMolecules = 50;
+    const int numMolecules = 25;
    const int numParticles = numMolecules*2;
    const double cutoff = 2.0;
-    const double boxSize = 5.0;
+    const double boxSize = 4.0;
    const double tolerance = 5;
    System system;
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
@@ -134,10 +134,10 @@ void testLargeSystem() {
 }

 void testVirtualSites() {
-    const int numMolecules = 50;
+    const int numMolecules = 25;
    const int numParticles = numMolecules*3;
    const double cutoff = 2.0;
-    const double boxSize = 5.0;
+    const double boxSize = 4.0;
    const double tolerance = 5;
    System system;
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));

--- a/platforms/cuda/tests/TestCudaVariableLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaVariableLangevinIntegrator.cpp
@@ -296,7 +296,7 @@ void testArgonBox() {
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
    system.addForce(nonbonded);

-    VariableLangevinIntegrator integrator(temp, 6.0, 1e-5);
+    VariableLangevinIntegrator integrator(temp, 6.0, 1e-4);
    Context context(system, integrator, platform);
    context.setPositions(positions);
    context.setVelocitiesToTemperature(temp);
@@ -308,13 +308,13 @@ void testArgonBox() {
    // Make sure the temperature is correct.
    
    double ke = 0.0;
-    for (int i = 0; i < 2000; ++i) {
-        double t = 2.0 + 0.01 * (i + 1);
+    for (int i = 0; i < 1000; ++i) {
+        double t = 2.0 + 0.02 * (i + 1);
        integrator.stepTo(t);
        State state = context.getState(State::Energy);
        ke += state.getKineticEnergy();
    }
-    ke /= 2000;
+    ke /= 1000;
    double expected = 1.5 * numParticles * BOLTZ * temp;
    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 0.01);
 }

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -71,11 +71,13 @@ public:
     * @param includeForce  true if forces should be computed
     * @param includeEnergy true if potential energy should be computed
     * @param groups        a set of bit flags for which force groups to include
+     * @param valid         the method may set this to false to indicate the results are invalid and the force/energy
+     *                      calculation should be repeated
     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
     */
-    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid);
 private:
   OpenCLContext& cl;
 };