Continuing to implement new CUDA platform: checkpointing, parallelization across multiple devices

387008ce · Peter Eastman · 17ae3aae · 387008ce · 387008ce · 387008ce
Commit 387008ce authored Jun 22, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -113,7 +113,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
                deviceIndex = i;
                bestSpeed = speed;
                bestCompute = major;
-                gpuArchitecture = intToString(major)+intToString(minor);
            }
        }
    }
@@ -121,6 +120,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        throw OpenMMException("No compatible CUDA device is available");
    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
    this->deviceIndex = deviceIndex;
+    int major, minor;
+    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+    gpuArchitecture = intToString(major)+intToString(minor);
    defaultOptimizationOptions = "--use_fast_math";
    unsigned int flags = CU_CTX_MAP_HOST;
    if (useBlockingSync)

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -26,7 +26,7 @@
 #include "CudaKernelFactory.h"
 #include "CudaKernels.h"
-//#include "CudaParallelKernels.h"
+#include "CudaParallelKernels.h"
 #include "CudaPlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/OpenMMException.h"
@@ -35,38 +35,38 @@ using namespace OpenMM;
 KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-//    if (data.contexts.size() > 1) {
+    if (data.contexts.size() > 1) {
-//        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
-//        
-//        if (name == CalcForcesAndEnergyKernel::Name())
+        if (name == CalcForcesAndEnergyKernel::Name())
-//            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
+            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
-//        if (name == CalcHarmonicBondForceKernel::Name())
+        if (name == CalcHarmonicBondForceKernel::Name())
-//            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomBondForceKernel::Name())
+        if (name == CalcCustomBondForceKernel::Name())
-//            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcHarmonicAngleForceKernel::Name())
+        if (name == CalcHarmonicAngleForceKernel::Name())
-//            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomAngleForceKernel::Name())
+        if (name == CalcCustomAngleForceKernel::Name())
-//            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcPeriodicTorsionForceKernel::Name())
+        if (name == CalcPeriodicTorsionForceKernel::Name())
-//            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcRBTorsionForceKernel::Name())
+        if (name == CalcRBTorsionForceKernel::Name())
-//            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCMAPTorsionForceKernel::Name())
+        if (name == CalcCMAPTorsionForceKernel::Name())
-//            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomTorsionForceKernel::Name())
+        if (name == CalcCustomTorsionForceKernel::Name())
-//            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcNonbondedForceKernel::Name())
+        if (name == CalcNonbondedForceKernel::Name())
-//            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomNonbondedForceKernel::Name())
+        if (name == CalcCustomNonbondedForceKernel::Name())
-//            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomExternalForceKernel::Name())
+        if (name == CalcCustomExternalForceKernel::Name())
-//            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomHbondForceKernel::Name())
+        if (name == CalcCustomHbondForceKernel::Name())
-//            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomCompoundBondForceKernel::Name())
+        if (name == CalcCustomCompoundBondForceKernel::Name())
-//            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
-//    }
+    }
    CudaContext& cu = *data.contexts[0];
    if (name == CalcForcesAndEnergyKernel::Name())
        return new CudaCalcForcesAndEnergyKernel(name, platform, cu);

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -37,7 +37,7 @@
 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
 #include "CudaIntegrationUtilities.h"
-//#include "CudaNonbondedUtilities.h"
+#include "CudaNonbondedUtilities.h"
 #include "CudaKernelSources.h"
 #include "lepton/ExpressionTreeNode.h"
 #include "lepton/Operation.h"
@@ -282,48 +282,62 @@ void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, cons
 void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
    cu.setAsCurrent();
-//    int version = 1;
+    int version = 1;
-//    stream.write((char*) &version, sizeof(int));
+    stream.write((char*) &version, sizeof(int));
-//    double time = cu.getTime();
+    double time = cu.getTime();
-//    stream.write((char*) &time, sizeof(double));
+    stream.write((char*) &time, sizeof(double));
-//    cu.getPosq().download();
+    int stepCount = cu.getStepCount();
-//    stream.write((char*) &cu.getPosq()[0], sizeof(mm_float4)*cu.getPosq().getSize());
+    stream.write((char*) &stepCount, sizeof(int));
-//    cu.getVelm().download();
+    int computeForceCount = cu.getComputeForceCount();
-//    stream.write((char*) &cu.getVelm()[0], sizeof(mm_float4)*cu.getVelm().getSize());
+    stream.write((char*) &computeForceCount, sizeof(int));
-//    stream.write((char*) &cu.getAtomIndex()[0], sizeof(cl_int)*cu.getAtomIndex().getSize());
+    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-//    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(mm_int4)*cu.getPosCellOffsets().size());
+    char* buffer = (char*) cu.getPinnedBuffer();
-//    mm_float4 box = cu.getPeriodicBoxSize();
+    cu.getPosq().download(buffer);
-//    stream.write((char*) &box, sizeof(mm_float4));
+    stream.write(buffer, bufferSize);
-//    cu.getIntegrationUtilities().createCheckpoint(stream);
+    cu.getVelm().download(buffer);
-//    SimTKOpenMMUtilities::createCheckpoint(stream);
+    stream.write(buffer, bufferSize);
+    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
+    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    double4 box = cu.getPeriodicBoxSize();
+    stream.write((char*) &box, sizeof(double4));
+    cu.getIntegrationUtilities().createCheckpoint(stream);
+    SimTKOpenMMUtilities::createCheckpoint(stream);
 }
 void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
    cu.setAsCurrent();
-//    int version;
+    int version;
-//    stream.read((char*) &version, sizeof(int));
+    stream.read((char*) &version, sizeof(int));
-//    if (version != 1)
+    if (version != 1)
-//        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
+        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
-//    double time;
+    double time;
-//    stream.read((char*) &time, sizeof(double));
+    stream.read((char*) &time, sizeof(double));
-//    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
+    int stepCount, computeForceCount;
-//    for (int i = 0; i < (int) contexts.size(); i++)
+    stream.read((char*) &stepCount, sizeof(int));
-//        contexts[i]->setTime(time);
+    stream.read((char*) &computeForceCount, sizeof(int));
-//    stream.read((char*) &cu.getPosq()[0], sizeof(mm_float4)*cu.getPosq().getSize());
+    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
-//    cu.getPosq().upload();
+    for (int i = 0; i < (int) contexts.size(); i++) {
-//    stream.read((char*) &cu.getVelm()[0], sizeof(mm_float4)*cu.getVelm().getSize());
+        contexts[i]->setTime(time);
-//    cu.getVelm().upload();
+        contexts[i]->setStepCount(stepCount);
-//    stream.read((char*) &cu.getAtomIndex()[0], sizeof(cl_int)*cu.getAtomIndex().getSize());
+        contexts[i]->setComputeForceCount(computeForceCount);
-//    cu.getAtomIndex().upload();
+    }
-//    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(mm_int4)*cu.getPosCellOffsets().size());
+    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
-//    mm_float4 box;
+    char* buffer = (char*) cu.getPinnedBuffer();
-//    stream.read((char*) &box, sizeof(mm_float4));
+    stream.read(buffer, bufferSize);
-//    for (int i = 0; i < (int) contexts.size(); i++)
+    cu.getPosq().upload(buffer);
-//        contexts[i]->setPeriodicBoxSize(box.x, box.y, box.z);
+    stream.read(buffer, bufferSize);
-//    cu.getIntegrationUtilities().loadCheckpoint(stream);
+    cu.getVelm().upload(buffer);
-//    SimTKOpenMMUtilities::loadCheckpoint(stream);
+    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
-//    for (int i = 0; i < cu.getReorderListeners().size(); i++)
+    cu.getAtomIndexArray().upload(cu.getAtomIndex());
-//        cu.getReorderListeners()[i]->execute();
+    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    double4 box;
+    stream.read((char*) &box, sizeof(double4));
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->setPeriodicBoxSize(box.x, box.y, box.z);
+    cu.getIntegrationUtilities().loadCheckpoint(stream);
+    SimTKOpenMMUtilities::loadCheckpoint(stream);
+    for (int i = 0; i < cu.getReorderListeners().size(); i++)
+        cu.getReorderListeners()[i]->execute();
 }
 void CudaApplyConstraintsKernel::initialize(const System& system) {
@@ -840,6 +854,7 @@ private:
 };
 CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
 }
@@ -926,6 +941,7 @@ private:
 };
 CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
+    cu.setAsCurrent();
    if (params1 != NULL)
        delete params1;
    if (params2 != NULL)
@@ -3983,8 +3999,8 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }
 void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -3995,6 +4011,7 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
 }
 void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double dt = integrator.getStepSize();
@@ -4042,8 +4059,8 @@ CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
 }
 void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4056,6 +4073,7 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
 }
 void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double temperature = integrator.getTemperature();
@@ -4120,8 +4138,8 @@ CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
 }
 void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4133,6 +4151,7 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
 }
 void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double temperature = integrator.getTemperature();
@@ -4175,8 +4194,8 @@ CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel()
 }
 void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -4188,6 +4207,7 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con
 }
 double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
@@ -4252,8 +4272,8 @@ CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKerne
 }
 void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4268,6 +4288,7 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
 }
 double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
@@ -4412,8 +4433,8 @@ CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
 }
 void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
@@ -4492,6 +4513,7 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
 }
 void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    int numSteps = integrator.getNumComputations();

--- a/platforms/cuda2/src/CudaParallelKernels.cpp
+++ b/platforms/cuda2/src/CudaParallelKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaParallelKernels.h"
+#include "CudaKernelSources.h"
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result) \
+if (result != CUDA_SUCCESS) { \
+    std::stringstream m; \
+    m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+    throw OpenMMException(m.str());\
+}
+/**
+ * Get the current clock time, measured in microseconds.
+ *
+ * The returned value is only meant to be compared with other values returned by
+ * this function: the Windows branch counts from 1-1-1601, while the other branch
+ * uses whatever origin gettimeofday() reports.
+ */
+#ifdef _MSC_VER
+    #include <Windows.h>
+    static long long getTime() {
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft);	 // 100-nanoseconds since 1-1-1601
+        ULARGE_INTEGER result;
+        result.LowPart = ft.dwLowDateTime;
+        result.HighPart = ft.dwHighDateTime;
+        // Ten 100 ns ticks make one microsecond.
+        return (long long) (result.QuadPart/10);
+    }
+#else
+    #include <sys/time.h> 
+    static long long getTime() {
+        struct timeval tod;
+        gettimeofday(&tod, 0);
+        // Widen to long long before multiplying: where time_t is 32 bits wide,
+        // 1000000*tv_sec would overflow.
+        return 1000000LL*tod.tv_sec+tod.tv_usec;
+    }
+#endif
+/**
+ * Work task that starts a force/energy computation on one context.  Contexts whose
+ * index is greater than zero first upload the positions from the host buffer that
+ * was handed to the constructor.
+ */
+class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
+public:
+    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) :
+            context(context),
+            cu(cu),
+            kernel(kernel),
+            includeForce(includeForce),
+            includeEnergy(includeEnergy),
+            groups(groups),
+            pinnedMemory(pinnedMemory) {
+    }
+    void execute() {
+        cu.setAsCurrent();
+        // Context 0 is skipped here; only the other contexts copy positions in.
+        const bool needsPositions = (cu.getContextIndex() > 0);
+        if (needsPositions) {
+            cu.getPosq().upload(pinnedMemory, false);
+        }
+        kernel.beginComputation(context, includeForce, includeEnergy, groups);
+    }
+private:
+    ContextImpl& context;
+    CudaContext& cu;
+    CudaCalcForcesAndEnergyKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    int groups;
+    void* pinnedMemory;
+};
+/**
+ * Work task that finishes a force/energy computation on one context.  It adds the
+ * energy returned by the kernel to the referenced accumulator, downloads the forces
+ * of contexts with index greater than zero into their slice of the host force buffer,
+ * and records the time at which it completed.
+ */
+class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
+public:
+    FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory) :
+            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
+            completionTime(completionTime), pinnedMemory(pinnedMemory) {
+    }
+    void execute() {
+        // Execute the kernel, then download forces.
+        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
+        if (includeForce) {
+            if (cu.getContextIndex() > 0) {
+                // The host buffer holds one slice of 3*paddedNumAtoms long long values per
+                // context after the first.  Index through a typed pointer: arithmetic on
+                // void* is not valid C++, and compilers that allow it as an extension step
+                // in bytes, which would give the wrong offset for every slice but the first.
+                int numAtoms = cu.getPaddedNumAtoms();
+                long long* forceBuffer = static_cast<long long*>(pinnedMemory);
+                cu.getForce().download(&forceBuffer[(cu.getContextIndex()-1)*numAtoms*3]);
+            }
+            else {
+                string errorMessage = "Error synchronizing CUDA context";
+                CHECK_RESULT(cuCtxSynchronize());
+            }
+        }
+        completionTime = getTime();
+    }
+private:
+    ContextImpl& context;
+    CudaContext& cu;
+    CudaCalcForcesAndEnergyKernel& kernel;
+    bool includeForce, includeEnergy;
+    int groups;
+    double& energy;
+    long long& completionTime;
+    void* pinnedMemory;
+};
+/**
+ * Create the parallel kernel along with one single-device CudaCalcForcesAndEnergyKernel
+ * per context.  The device and host buffers start out NULL.
+ */
+CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext& deviceContext = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, deviceContext)));
+    }
+}
+/**
+ * Release the device array and the two host buffers, with the first context made
+ * current beforehand.
+ */
+CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() {
+    CudaContext& primary = *data.contexts[0];
+    primary.setAsCurrent();
+    // Deleting a null pointer does nothing, so no explicit test is needed here.
+    delete contextForces;
+    if (pinnedPositionBuffer != NULL) {
+        cuMemFreeHost(pinnedPositionBuffer);
+    }
+    if (pinnedForceBuffer != NULL) {
+        cuMemFreeHost(pinnedForceBuffer);
+    }
+}
+/**
+ * Build the "sumForces" kernel on the first context, then initialize every
+ * per-context kernel.
+ */
+void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
+    CudaContext& primary = *data.contexts[0];
+    primary.setAsCurrent();
+    CUmodule parallelModule = primary.createModule(CudaKernelSources::parallel);
+    sumKernel = primary.getKernel(parallelModule, "sumForces");
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; index++) {
+        getKernel(index).initialize(system);
+    }
+}
+/**
+ * Start a force/energy computation on every context.
+ *
+ * On first use this allocates the device array that receives the other contexts'
+ * forces and the two host buffers.  It then downloads the positions from the first
+ * context, resets each context's energy accumulator and queues a
+ * BeginComputationTask on each context's work thread.
+ */
+void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    CudaContext& cu = *data.contexts[0];
+    cu.setAsCurrent();
+    // Each buffer is tested on its own.  If one allocation throws, a later call creates
+    // only what is still missing instead of skipping allocation and then using a NULL
+    // host buffer.
+    if (contextForces == NULL || pinnedForceBuffer == NULL || pinnedPositionBuffer == NULL) {
+        string errorMessage = "Error allocating pinned memory";
+        if (contextForces == NULL) {
+            contextForces = CudaArray::create<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
+        }
+        if (pinnedForceBuffer == NULL) {
+            CHECK_RESULT(cuMemHostAlloc(&pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), CU_MEMHOSTALLOC_PORTABLE));
+        }
+        if (pinnedPositionBuffer == NULL) {
+            CHECK_RESULT(cuMemHostAlloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), CU_MEMHOSTALLOC_PORTABLE));
+        }
+    }
+    // Copy coordinates over to each device and execute the kernel.
+    cu.getPosq().download(pinnedPositionBuffer);
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        data.contextEnergy[i] = 0.0;
+        CudaContext& deviceContext = *data.contexts[i];
+        CudaContext::WorkThread& thread = deviceContext.getWorkThread();
+        thread.addTask(new BeginComputationTask(context, deviceContext, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer));
+    }
+}
+/**
+ * Finish a force/energy computation on every context and return the summed energy.
+ *
+ * A FinishComputationTask is queued on each context's work thread and
+ * data.syncContexts() is called before the per-context results are read.  When forces
+ * were requested, the forces from the other contexts are added into the first
+ * context's force buffer and the nonbonded tile ranges are adjusted using the
+ * recorded completion times.
+ */
+double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext& deviceContext = *data.contexts[index];
+        CudaContext::WorkThread& worker = deviceContext.getWorkThread();
+        worker.addTask(new FinishComputationTask(context, deviceContext, getKernel(index), includeForce, includeEnergy, groups,
+                data.contextEnergy[index], completionTimes[index], pinnedForceBuffer));
+    }
+    data.syncContexts();
+    double totalEnergy = 0.0;
+    const int numEnergies = (int) data.contextEnergy.size();
+    for (int index = 0; index < numEnergies; index++)
+        totalEnergy += data.contextEnergy[index];
+    if (!includeForce)
+        return totalEnergy;
+    // Sum the forces from all devices.
+    CudaContext& primary = *data.contexts[0];
+    contextForces->upload(pinnedForceBuffer, false);
+    int elementsPerBuffer = 3*primary.getPaddedNumAtoms();
+    int extraBuffers = data.contexts.size()-1;
+    void* kernelArgs[] = {&primary.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &elementsPerBuffer, &extraBuffers};
+    primary.executeKernel(sumKernel, kernelArgs, elementsPerBuffer);
+    // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+    // finished last to the one that finished first.
+    int fastest = 0;
+    int slowest = 0;
+    int totalTiles = 0;
+    const int numTimes = (int) completionTimes.size();
+    for (int index = 0; index < numTimes; index++) {
+        if (completionTimes[index] < completionTimes[fastest])
+            fastest = index;
+        if (completionTimes[index] > completionTimes[slowest])
+            slowest = index;
+        contextTiles[index] = data.contexts[index]->getNonbondedUtilities().getNumTiles();
+        totalTiles += contextTiles[index];
+    }
+    // Move one thousandth of the tiles, but at least one and never more than the
+    // slowest context currently holds.
+    int tilesToTransfer = (totalTiles/1000 < 1 ? 1 : totalTiles/1000);
+    if (contextTiles[slowest] < tilesToTransfer)
+        tilesToTransfer = contextTiles[slowest];
+    contextTiles[fastest] += tilesToTransfer;
+    contextTiles[slowest] -= tilesToTransfer;
+    // Hand each context a contiguous range of tiles, in context order.
+    int firstTile = 0;
+    const int numRanges = (int) contextTiles.size();
+    for (int index = 0; index < numRanges; index++) {
+        data.contexts[index]->getNonbondedUtilities().setTileRange(firstTile, contextTiles[index]);
+        firstTile += contextTiles[index];
+    }
+    return totalEnergy;
+}
+/**
+ * Work task that executes one CudaCalcHarmonicBondForceKernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class CudaParallelCalcHarmonicBondForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcHarmonicBondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context),
+            kernel(kernel),
+            includeForce(includeForce),
+            includeEnergy(includeEnergy),
+            energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcHarmonicBondForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+/**
+ * Create one single-device CudaCalcHarmonicBondForceKernel per context, stored in
+ * context order.
+ */
+CudaParallelCalcHarmonicBondForceKernel::CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcHarmonicBondForceKernel(name, platform), data(data) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext& deviceContext = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcHarmonicBondForceKernel(name, platform, deviceContext, system)));
+    }
+}
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; index++) {
+        getKernel(index).initialize(system, force);
+    }
+}
+/**
+ * Queue one Task per context on that context's work thread.  The tasks add their
+ * energies to data.contextEnergy, so this method itself always returns 0.
+ */
+double CudaParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+/**
+ * Forward the updated force parameters to every per-context kernel.
+ */
+void CudaParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; index++) {
+        getKernel(index).copyParametersToContext(context, force);
+    }
+}
+/**
+ * Work task that executes one CudaCalcCustomBondForceKernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class CudaParallelCalcCustomBondForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomBondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context),
+            kernel(kernel),
+            includeForce(includeForce),
+            includeEnergy(includeEnergy),
+            energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomBondForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+/**
+ * Create one single-device CudaCalcCustomBondForceKernel per context, stored in
+ * context order.
+ */
+CudaParallelCalcCustomBondForceKernel::CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomBondForceKernel(name, platform), data(data) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext& deviceContext = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomBondForceKernel(name, platform, deviceContext, system)));
+    }
+}
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; index++) {
+        getKernel(index).initialize(system, force);
+    }
+}
+/**
+ * Queue one Task per context on that context's work thread.  The tasks add their
+ * energies to data.contextEnergy, so this method itself always returns 0.
+ */
+double CudaParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        CudaContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomBondForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcHarmonicAngleForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcHarmonicAngleForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcHarmonicAngleForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcHarmonicAngleForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcHarmonicAngleForceKernel::CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcHarmonicAngleForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcHarmonicAngleForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcHarmonicAngleForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcHarmonicAngleForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCustomAngleForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomAngleForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomAngleForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomAngleForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomAngleForceKernel::CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomAngleForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomAngleForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomAngleForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomAngleForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcPeriodicTorsionForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcPeriodicTorsionForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcPeriodicTorsionForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcPeriodicTorsionForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcPeriodicTorsionForceKernel::CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcPeriodicTorsionForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcPeriodicTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcPeriodicTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcRBTorsionForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcRBTorsionForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcRBTorsionForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcRBTorsionForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcRBTorsionForceKernel::CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcRBTorsionForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcRBTorsionForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcRBTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcRBTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCMAPTorsionForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCMAPTorsionForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCMAPTorsionForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCMAPTorsionForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCMAPTorsionForceKernel::CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCMAPTorsionForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCMAPTorsionForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCMAPTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+// Work item that runs one CudaCalcCustomTorsionForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomTorsionForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomTorsionForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomTorsionForceKernel::CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomTorsionForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomTorsionForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomTorsionForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcNonbondedForceKernel (forwarding the direct and
+// reciprocal flags) and adds the value it returns to the referenced energy accumulator.
+class CudaParallelCalcNonbondedForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcNonbondedForceKernel& kernel, bool includeForce, bool includeEnergy,
+            bool includeDirect, bool includeReciprocal, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy),
+            includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcNonbondedForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    bool includeDirect;
+    bool includeReciprocal;
+    double& energy;
+};
+CudaParallelCalcNonbondedForceKernel::CudaParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcNonbondedForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcNonbondedForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcNonbondedForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcNonbondedForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCustomNonbondedForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomNonbondedForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomNonbondedForceKernel::CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomNonbondedForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomNonbondedForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomNonbondedForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomNonbondedForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCustomExternalForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomExternalForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomExternalForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomExternalForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomExternalForceKernel::CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomExternalForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomExternalForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomExternalForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomExternalForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCustomHbondForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomHbondForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomHbondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomHbondForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomHbondForceKernel::CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomHbondForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomHbondForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomHbondForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomHbondForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
+// Work item that runs one CudaCalcCustomCompoundBondForceKernel and adds the value it
+// returns to the referenced energy accumulator.
+class CudaParallelCalcCustomCompoundBondForceKernel::Task : public CudaContext::WorkTask {
+public:
+    Task(ContextImpl& context, CudaCalcCustomCompoundBondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+private:
+    ContextImpl& context;
+    CudaCalcCustomCompoundBondForceKernel& kernel;
+    bool includeForce;
+    bool includeEnergy;
+    double& energy;
+};
+CudaParallelCalcCustomCompoundBondForceKernel::CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+        CalcCustomCompoundBondForceKernel(name, platform), data(data) {
+    // Wrap one single-context kernel for every context in the platform data.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext& cu = *data.contexts[index];
+        kernels.push_back(Kernel(new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, system)));
+    }
+}
+void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
+    // Initialize every wrapped per-context kernel with the same system and force.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomCompoundBondForceKernel& wrapped = getKernel(index);
+        wrapped.initialize(system, force);
+    }
+}
+double CudaParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // Hand one Task to the work thread of every context. Each Task adds its kernel's
+    // energy to data.contextEnergy[index], so this call itself always returns 0.
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        CudaContext::WorkThread& worker = (*data.contexts[index]).getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+void CudaParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
+    // Forward the parameter update to every wrapped per-context kernel.
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        CudaCalcCustomCompoundBondForceKernel& wrapped = getKernel(index);
+        wrapped.copyParametersToContext(context, force);
+    }
+}
--- a/platforms/cuda2/src/CudaParallelKernels.h
+++ b/platforms/cuda2/src/CudaParallelKernels.h
+#ifndef OPENMM_CUDAPARALLELKERNELS_H_
+#define OPENMM_CUDAPARALLELKERNELS_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaPlatform.h"
+#include "CudaContext.h"
+#include "CudaKernels.h"
+namespace OpenMM {
+/**
+ * Multi-context variant of the kernel that brackets every force and energy computation.  It is
+ * invoked once before and once after the individual forces are evaluated, so the Platform can
+ * clear buffers and perform other setup at the start and combine the final results at the end.
+ */
+class CudaParallelCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+public:
+    CudaParallelCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data);
+    ~CudaParallelCalcForcesAndEnergyKernel();
+    /**
+     * Get the wrapped kernel stored at a given position.
+     *
+     * @param index      the position of the kernel in the internal list
+     * @return the kernel's implementation, cast to CudaCalcForcesAndEnergyKernel
+     */
+    CudaCalcForcesAndEnergyKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcForcesAndEnergyKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Called at the start of each force/energy computation, before calcForcesAndEnergy() has been
+     * invoked on any ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    /**
+     * Called at the end of each force/energy computation, after calcForcesAndEnergy() has been
+     * invoked on every ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+     */
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+private:
+    class BeginComputationTask;
+    class FinishComputationTask;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels;
+    std::vector<long long> completionTimes;
+    std::vector<int> contextTiles;
+    CudaArray* contextForces;
+    void* pinnedPositionBuffer;
+    void* pinnedForceBuffer;
+    CUfunction sumKernel;
+};
+/**
+ * Multi-context variant of the kernel HarmonicBondForce invokes to compute the forces acting on
+ * the system and the energy of the system.
+ */
+class CudaParallelCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
+public:
+    CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get the wrapped kernel stored at a given position.
+     *
+     * @param index      the position of the kernel in the internal list
+     * @return the kernel's implementation, cast to CudaCalcHarmonicBondForceKernel
+     */
+    CudaCalcHarmonicBondForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcHarmonicBondForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the HarmonicBondForce this kernel will be used for
+     */
+    void initialize(const System& system, const HarmonicBondForce& force);
+    /**
+     * Run the kernel to compute the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push changed force parameters to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the HarmonicBondForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
+private:
+    class Task;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels;
+};
+/**
+ * Multi-context variant of the kernel CustomBondForce invokes to compute the forces acting on
+ * the system and the energy of the system.
+ */
+class CudaParallelCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
+public:
+    CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get the wrapped kernel stored at a given position.
+     *
+     * @param index      the position of the kernel in the internal list
+     * @return the kernel's implementation, cast to CudaCalcCustomBondForceKernel
+     */
+    CudaCalcCustomBondForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomBondForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomBondForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomBondForce& force);
+    /**
+     * Run the kernel to compute the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push changed force parameters to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomBondForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
+private:
+    class Task;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels;
+};
+/**
+ * Multi-context variant of the kernel HarmonicAngleForce invokes to compute the forces acting on
+ * the system and the energy of the system.
+ */
+class CudaParallelCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
+public:
+    CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get the wrapped kernel stored at a given position.
+     *
+     * @param index      the position of the kernel in the internal list
+     * @return the kernel's implementation, cast to CudaCalcHarmonicAngleForceKernel
+     */
+    CudaCalcHarmonicAngleForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcHarmonicAngleForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the HarmonicAngleForce this kernel will be used for
+     */
+    void initialize(const System& system, const HarmonicAngleForce& force);
+    /**
+     * Run the kernel to compute the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push changed force parameters to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the HarmonicAngleForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
+private:
+    class Task;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels;
+};
+/**
+ * Multi-context variant of the kernel CustomAngleForce invokes to compute the forces acting on
+ * the system and the energy of the system.
+ */
+class CudaParallelCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
+public:
+    CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get the wrapped kernel stored at a given position.
+     *
+     * @param index      the position of the kernel in the internal list
+     * @return the kernel's implementation, cast to CudaCalcCustomAngleForceKernel
+     */
+    CudaCalcCustomAngleForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomAngleForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomAngleForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomAngleForce& force);
+    /**
+     * Run the kernel to compute the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push changed force parameters to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomAngleForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
+private:
+    class Task;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels;
+};
+/**
+ * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
+ * It keeps a list of kernels and exposes each one as a CudaCalcPeriodicTorsionForceKernel.
+ */
+class CudaParallelCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
+public:
+    CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get one of the wrapped kernels, downcast to its concrete CUDA type.
+     *
+     * @param index    the position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcPeriodicTorsionForceKernel& getKernel(int index) {
+        return dynamic_cast<CudaCalcPeriodicTorsionForceKernel&>(kernels[index].getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the PeriodicTorsionForce this kernel will be used for
+     */
+    void initialize(const System& system, const PeriodicTorsionForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the PeriodicTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
+private:
+    // Task is an implementation detail; it is kept private like in the other parallel kernel classes in this header.
+    class Task;
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that RBTorsionForce invokes to compute the forces acting on the system
+ * and the system's energy. It keeps a list of kernels and exposes each one as a CudaCalcRBTorsionForceKernel.
+ */
+class CudaParallelCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
+public:
+    CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcRBTorsionForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcRBTorsionForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the RBTorsionForce this kernel serves
+     */
+    void initialize(const System& system, const RBTorsionForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the RBTorsionForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CMAPTorsionForce invokes to compute the forces acting on the system
+ * and the system's energy. It keeps a list of kernels and exposes each one as a CudaCalcCMAPTorsionForceKernel.
+ */
+class CudaParallelCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
+public:
+    CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCMAPTorsionForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCMAPTorsionForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CMAPTorsionForce this kernel serves
+     */
+    void initialize(const System& system, const CMAPTorsionForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CustomTorsionForce invokes to compute the forces acting on the system
+ * and the system's energy. It keeps a list of kernels and exposes each one as a CudaCalcCustomTorsionForceKernel.
+ */
+class CudaParallelCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
+public:
+    CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCustomTorsionForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomTorsionForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CustomTorsionForce this kernel serves
+     */
+    void initialize(const System& system, const CustomTorsionForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the CustomTorsionForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
+ * It keeps a list of kernels and exposes each one as a CudaCalcNonbondedForceKernel.
+ */
+class CudaParallelCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
+public:
+    CudaParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Get one of the wrapped kernels, downcast to its concrete CUDA type.
+     *
+     * @param index    the position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcNonbondedForceKernel& getKernel(int index) {
+        return dynamic_cast<CudaCalcNonbondedForceKernel&>(kernels[index].getImpl());
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the NonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const NonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context            the context in which to execute this kernel
+     * @param includeForces      true if forces should be calculated
+     * @param includeEnergy      true if the energy should be calculated
+     * @param includeDirect      true if direct space interactions should be included
+     * @param includeReciprocal  true if reciprocal space interactions should be included
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the NonbondedForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CustomNonbondedForce invokes to compute the forces acting on the system.
+ * It keeps a list of kernels and exposes each one as a CudaCalcCustomNonbondedForceKernel.
+ */
+class CudaParallelCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+public:
+    CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCustomNonbondedForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomNonbondedForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CustomNonbondedForce this kernel serves
+     */
+    void initialize(const System& system, const CustomNonbondedForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the CustomNonbondedForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CustomExternalForce invokes to compute the forces acting on the system
+ * and the system's energy. It keeps a list of kernels and exposes each one as a CudaCalcCustomExternalForceKernel.
+ */
+class CudaParallelCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
+public:
+    CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCustomExternalForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomExternalForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CustomExternalForce this kernel serves
+     */
+    void initialize(const System& system, const CustomExternalForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the CustomExternalForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CustomHbondForce invokes to compute the forces acting on the system.
+ * It keeps a list of kernels and exposes each one as a CudaCalcCustomHbondForceKernel.
+ */
+class CudaParallelCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
+public:
+    CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCustomHbondForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomHbondForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CustomHbondForce this kernel serves
+     */
+    void initialize(const System& system, const CustomHbondForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the CustomHbondForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+/**
+ * Parallel counterpart of the kernel that CustomCompoundBondForce invokes to compute the forces acting on the system.
+ * It keeps a list of kernels and exposes each one as a CudaCalcCustomCompoundBondForceKernel.
+ */
+class CudaParallelCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
+public:
+    CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system);
+    /**
+     * Look up one of the wrapped kernels and downcast it to its concrete CUDA type.
+     *
+     * @param index    position of the kernel in the internal list
+     * @return the kernel implementation stored at that position
+     */
+    CudaCalcCustomCompoundBondForceKernel& getKernel(int index) {
+        Kernel& wrapped = kernels[index];
+        return dynamic_cast<CudaCalcCustomCompoundBondForceKernel&>(wrapped.getImpl());
+    }
+    /**
+     * Prepare the kernel for use.
+     *
+     * @param system     the System to which this kernel is applied
+     * @param force      the CustomCompoundBondForce this kernel serves
+     */
+    void initialize(const System& system, const CustomCompoundBondForce& force);
+    /**
+     * Run the kernel, computing forces and/or energy.
+     *
+     * @param context        the context the kernel is executed in
+     * @param includeForces  whether forces should be computed
+     * @param includeEnergy  whether the energy should be computed
+     * @return the potential energy contributed by the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Push modified force parameters into a context.
+     *
+     * @param context    the context that receives the parameters
+     * @param force      the CustomCompoundBondForce supplying the parameters
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);
+private:
+    class Task; // nested type, forward-declared only
+    CudaPlatform::PlatformData& data;
+    std::vector<Kernel> kernels; // the kernels handed out by getKernel()
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAPARALLELKERNELS_H_*/
--- a/platforms/cuda2/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda2/src/kernels/findInteractingBlocks.cu
@@ -101,13 +101,14 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
 extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
        unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
-        unsigned int endTileIndex) {
+        unsigned int numTiles) {
    __shared__ ushort2 buffer[BUFFER_SIZE];
    __shared__ int valid[BUFFER_SIZE];
    __shared__ short sum[BUFFER_SIZE];
    __shared__ ushort2 temp[BUFFER_SIZE];
    __shared__ int bufferFull;
    __shared__ int globalIndex;
+    unsigned int endTileIndex = startTileIndex+numTiles;
    int valuesInBuffer = 0;
    if (threadIdx.x == 0)
        bufferFull = false;

--- a/platforms/cuda2/src/kernels/parallel.cu
+++ b/platforms/cuda2/src/kernels/parallel.cu
+/**
+ * Sum the forces computed by different contexts.
+ *
+ * For every element index in [0, bufferSize), the values buffer[index + k*bufferSize] for k = 0 .. numBuffers-1
+ * are added to force[index], and the total is written back to force[index].
+ *
+ * @param force       accumulator of bufferSize 64-bit integers; read and updated in place
+ * @param buffer      numBuffers consecutive blocks of bufferSize 64-bit integers each; only read
+ * @param bufferSize  number of elements in force and in each block of buffer
+ * @param numBuffers  number of blocks stored in buffer
+ *
+ * The outer loop is a grid-stride loop, so any 1D launch configuration covers all elements.
+ * bufferSize*numBuffers is computed in int and is assumed to fit in that range.
+ */
+extern "C" __global__ void sumForces(long long* __restrict__ force, const long long* __restrict__ buffer, int bufferSize, int numBuffers) {
+    const int totalSize = bufferSize*numBuffers;
+    const int stride = blockDim.x*gridDim.x; // loop invariant: total number of threads in the grid
+    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += stride) {
+        long long sum = force[index];
+        // Walk the same element in every block of buffer.
+        for (int i = index; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        force[index] = sum;
+    }
+}
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
@@ -73,34 +73,4 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }
-/**
- * Sum a collection of buffers into the first one.
- */
-__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
-    int index = blockDim.x*blockIdx.x+threadIdx.x;
-    int totalSize = bufferSize*numBuffers;
-    while (index < bufferSize) {
-        float4 sum = buffer[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += buffer[i];
-        buffer[index] = sum;
-        index += blockDim.x*gridDim.x;
-    }
-}
-/**
- * Sum the various buffers containing forces.
- */
-__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
-    int totalSize = bufferSize*numBuffers;
-    float scale = 1.0f/(float) 0xFFFFFFFF;
-    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
-        float4 sum = make_float4(scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
-        for (int i = index; i < totalSize; i += bufferSize)
-            sum += buffer[i];
-        buffer[index] = sum;
-    }
-}
 }
\ No newline at end of file
--- a/platforms/cuda2/tests/TestCudaCheckpoints.cpp
+++ b/platforms/cuda2/tests/TestCudaCheckpoints.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests creating and loading checkpoints with the CUDA platform.
+ */
+#include "CudaPlatform.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/AndersenThermostat.h"
+#include "openmm/Context.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+/**
+ * Assert that two States match: same time, periodic box vectors, context parameters,
+ * and per-particle positions and velocities (floating point values within TOL).
+ *
+ * @param s1    the reference State; must contain positions, velocities and parameters
+ * @param s2    the State to check against s1; must contain the same kinds of data
+ */
+void compareStates(State& s1, State& s2) {
+    ASSERT_EQUAL_TOL(s1.getTime(), s2.getTime(), TOL);
+    // Box vectors and parameters belong to the State as a whole, so they are checked once
+    // (and are checked even when there are no particles) instead of once per particle.
+    Vec3 a1, b1, c1, a2, b2, c2;
+    s1.getPeriodicBoxVectors(a1, b1, c1);
+    s2.getPeriodicBoxVectors(a2, b2, c2);
+    ASSERT_EQUAL_VEC(a1, a2, TOL);
+    ASSERT_EQUAL_VEC(b1, b2, TOL);
+    ASSERT_EQUAL_VEC(c1, c2, TOL);
+    const map<string, double>& params1 = s1.getParameters();
+    const map<string, double>& params2 = s2.getParameters();
+    for (map<string, double>::const_iterator iter = params1.begin(); iter != params1.end(); ++iter) {
+        map<string, double>::const_iterator match = params2.find(iter->first);
+        // Fail cleanly rather than dereferencing end() if s2 lacks the parameter.
+        ASSERT(match != params2.end());
+        ASSERT_EQUAL(iter->second, match->second);
+    }
+    // Guard the indexed accesses below against a State with fewer particles.
+    ASSERT_EQUAL(s1.getPositions().size(), s2.getPositions().size());
+    ASSERT_EQUAL(s1.getVelocities().size(), s2.getVelocities().size());
+    int numParticles = s1.getPositions().size();
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL_VEC(s1.getPositions()[i], s2.getPositions()[i], TOL);
+        ASSERT_EQUAL_VEC(s1.getVelocities()[i], s2.getVelocities()[i], TOL);
+    }
+}
+/**
+ * Run the checkpoint round-trip checks on one Context.
+ *
+ * The Context is given its initial conditions and simulated for a while, then a checkpoint is created.
+ * The simulation continues, the box vectors and temperature parameter are deliberately changed, and the
+ * checkpoint is loaded again. The restored State must match the one recorded when the checkpoint was
+ * made, and continuing from it must reproduce the State recorded after the first continuation.
+ *
+ * @param context      the Context to test
+ * @param integrator   the integrator the Context was created with
+ * @param positions    initial particle positions
+ * @param boxSize      edge length of the cubic periodic box
+ * @param temperature  initial value of the AndersenThermostat temperature parameter
+ */
+static void checkCheckpointRoundTrip(Context& context, VerletIntegrator& integrator, const vector<Vec3>& positions, double boxSize, double temperature) {
+    const int stateTypes = State::Positions | State::Velocities | State::Parameters;
+    context.setPositions(positions);
+    context.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature);
+    // Run for a little while.
+    integrator.step(100);
+    // Record the current state and make a checkpoint.
+    State atCheckpoint = context.getState(stateTypes);
+    stringstream checkpoint(ios_base::out | ios_base::in | ios_base::binary);
+    context.createCheckpoint(checkpoint);
+    // Continue the simulation for a few more steps and record the state again.
+    integrator.step(10);
+    State afterContinuing = context.getState(stateTypes);
+    // Change settings, then restore from the checkpoint and see if everything gets restored correctly.
+    context.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    context.loadCheckpoint(checkpoint);
+    State restored = context.getState(stateTypes);
+    compareStates(atCheckpoint, restored);
+    // Now simulate from there and see if the trajectory is identical.
+    integrator.step(10);
+    State replayed = context.getState(stateTypes);
+    compareStates(afterContinuing, replayed);
+}
+/**
+ * Build a periodic system of charged particles with an AndersenThermostat and a NonbondedForce, then
+ * verify checkpointing both on a Context created with default properties and on a Context that lists
+ * the same device twice in CudaDeviceIndex.
+ */
+void testCheckpoint() {
+    const int numParticles = 100;
+    const double boxSize = 5.0;
+    const double temperature = 200.0;
+    CudaPlatform platform;
+    System system;
+    system.addForce(new AndersenThermostat(0.0, 100.0));
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(i%2 == 0 ? 0.1 : -0.1, 0.2, 0.1);
+        // Redraw the position until it is at least 0.1 away from every previously placed particle.
+        bool clash;
+        do {
+            clash = false;
+            positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            for (int j = 0; j < i; j++) {
+                Vec3 delta = positions[i]-positions[j];
+                if (sqrt(delta.dot(delta)) < 0.1) {
+                    clash = true;
+                    break; // one clash is enough to force a redraw
+                }
+            }
+        } while (clash);
+    }
+    // First test a Context created with default platform properties.
+    VerletIntegrator integrator(0.001);
+    Context context(system, integrator, platform);
+    checkCheckpointRoundTrip(context, integrator, positions, boxSize, temperature);
+    // Create a new Context that uses multiple devices, by listing the first Context's device twice.
+    string deviceIndex = platform.getPropertyValue(context, CudaPlatform::CudaDeviceIndex());
+    map<string, string> props;
+    props[CudaPlatform::CudaDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform, props);
+    // Now repeat all of the above tests with it.
+    checkCheckpointRoundTrip(context2, integrator2, positions, boxSize, temperature);
+}
+/**
+ * Test entry point: runs testCheckpoint(), reports any exception on standard output,
+ * and returns 1 on failure or 0 on success.
+ */
+int main() {
+    bool passed = true;
+    try {
+        testCheckpoint();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        passed = false;
+    }
+    if (!passed)
+        return 1;
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomAngleForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomAngleForce.cpp
@@ -164,7 +164,7 @@ void testParallelComputation() {
 int main() {
    try {
        testAngles();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomBondForce.cpp
@@ -169,7 +169,7 @@ int main() {
    try {
        testBonds();
        testManyParameters();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomCompoundBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomCompoundBondForce.cpp
@@ -205,7 +205,7 @@ int main() {
    try {
        testBond();
        testPositionDependence();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomExternalForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomExternalForce.cpp
@@ -166,7 +166,7 @@ int main() {
    try {
        testForce();
        testManyParameters();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
@@ -424,7 +424,7 @@ int main() {
        testPeriodic();
        testTabulatedFunction();
        testCoulombLennardJones();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomTorsionForce.cpp
@@ -205,7 +205,7 @@ int main() {
    try {
        testTorsions();
        testRange();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaHarmonicAngleForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicAngleForce.cpp
@@ -130,7 +130,7 @@ void testParallelComputation() {
 int main() {
    try {
        testAngles();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
@@ -121,7 +121,7 @@ void testParallelComputation() {
 int main() {
    try {
        testBonds();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaNonbondedForce.cpp
@@ -814,8 +814,8 @@ int main() {
        testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-//        testParallelComputation(false);
+        testParallelComputation(false);
-//        testParallelComputation(true);
+        testParallelComputation(true);
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaPeriodicTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaPeriodicTorsionForce.cpp
@@ -124,7 +124,7 @@ void testParallelComputation() {
 int main() {
    try {
        testPeriodicTorsions();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaRBTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaRBTorsionForce.cpp
@@ -143,7 +143,7 @@ void testParallelComputation() {
 int main() {
    try {
        testRBTorsions();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;