Continuing to implement new CUDA platform: checkpointing, parallelization across multiple devices

387008ce · Peter Eastman · 17ae3aae · 387008ce · 387008ce · 387008ce
Commit 387008ce authored Jun 22, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -113,7 +113,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
                deviceIndex = i;
                bestSpeed = speed;
                bestCompute = major;
-                gpuArchitecture = intToString(major)+intToString(minor);
            }
        }
    }
@@ -121,6 +120,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        throw OpenMMException("No compatible CUDA device is available");
    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
    this->deviceIndex = deviceIndex;
+    int major, minor;
+    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
+    gpuArchitecture = intToString(major)+intToString(minor);
    defaultOptimizationOptions = "--use_fast_math";
    unsigned int flags = CU_CTX_MAP_HOST;
    if (useBlockingSync)

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -26,7 +26,7 @@

 #include "CudaKernelFactory.h"
 #include "CudaKernels.h"
-//#include "CudaParallelKernels.h"
+#include "CudaParallelKernels.h"
 #include "CudaPlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/OpenMMException.h"
@@ -35,38 +35,38 @@ using namespace OpenMM;

 KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-//    if (data.contexts.size() > 1) {
-//        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
-//        
-//        if (name == CalcForcesAndEnergyKernel::Name())
-//            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
-//        if (name == CalcHarmonicBondForceKernel::Name())
-//            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomBondForceKernel::Name())
-//            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcHarmonicAngleForceKernel::Name())
-//            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomAngleForceKernel::Name())
-//            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcPeriodicTorsionForceKernel::Name())
-//            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcRBTorsionForceKernel::Name())
-//            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCMAPTorsionForceKernel::Name())
-//            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomTorsionForceKernel::Name())
-//            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcNonbondedForceKernel::Name())
-//            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomNonbondedForceKernel::Name())
-//            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomExternalForceKernel::Name())
-//            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomHbondForceKernel::Name())
-//            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
-//        if (name == CalcCustomCompoundBondForceKernel::Name())
-//            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
-//    }
+    if (data.contexts.size() > 1) {
+        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+        
+        if (name == CalcForcesAndEnergyKernel::Name())
+            return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
+        if (name == CalcHarmonicBondForceKernel::Name())
+            return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomBondForceKernel::Name())
+            return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcHarmonicAngleForceKernel::Name())
+            return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomAngleForceKernel::Name())
+            return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcPeriodicTorsionForceKernel::Name())
+            return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcRBTorsionForceKernel::Name())
+            return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCMAPTorsionForceKernel::Name())
+            return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomTorsionForceKernel::Name())
+            return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcNonbondedForceKernel::Name())
+            return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomNonbondedForceKernel::Name())
+            return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomExternalForceKernel::Name())
+            return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomHbondForceKernel::Name())
+            return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomCompoundBondForceKernel::Name())
+            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+    }
    CudaContext& cu = *data.contexts[0];
    if (name == CalcForcesAndEnergyKernel::Name())
        return new CudaCalcForcesAndEnergyKernel(name, platform, cu);

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -37,7 +37,7 @@
 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
 #include "CudaIntegrationUtilities.h"
-//#include "CudaNonbondedUtilities.h"
+#include "CudaNonbondedUtilities.h"
 #include "CudaKernelSources.h"
 #include "lepton/ExpressionTreeNode.h"
 #include "lepton/Operation.h"
@@ -282,48 +282,62 @@ void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, cons

 void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
    cu.setAsCurrent();
-//    int version = 1;
-//    stream.write((char*) &version, sizeof(int));
-//    double time = cu.getTime();
-//    stream.write((char*) &time, sizeof(double));
-//    cu.getPosq().download();
-//    stream.write((char*) &cu.getPosq()[0], sizeof(mm_float4)*cu.getPosq().getSize());
-//    cu.getVelm().download();
-//    stream.write((char*) &cu.getVelm()[0], sizeof(mm_float4)*cu.getVelm().getSize());
-//    stream.write((char*) &cu.getAtomIndex()[0], sizeof(cl_int)*cu.getAtomIndex().getSize());
-//    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(mm_int4)*cu.getPosCellOffsets().size());
-//    mm_float4 box = cu.getPeriodicBoxSize();
-//    stream.write((char*) &box, sizeof(mm_float4));
-//    cu.getIntegrationUtilities().createCheckpoint(stream);
-//    SimTKOpenMMUtilities::createCheckpoint(stream);
+    int version = 1;
+    stream.write((char*) &version, sizeof(int));
+    double time = cu.getTime();
+    stream.write((char*) &time, sizeof(double));
+    int stepCount = cu.getStepCount();
+    stream.write((char*) &stepCount, sizeof(int));
+    int computeForceCount = cu.getComputeForceCount();
+    stream.write((char*) &computeForceCount, sizeof(int));
+    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
+    char* buffer = (char*) cu.getPinnedBuffer();
+    cu.getPosq().download(buffer);
+    stream.write(buffer, bufferSize);
+    cu.getVelm().download(buffer);
+    stream.write(buffer, bufferSize);
+    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
+    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    double4 box = cu.getPeriodicBoxSize();
+    stream.write((char*) &box, sizeof(double4));
+    cu.getIntegrationUtilities().createCheckpoint(stream);
+    SimTKOpenMMUtilities::createCheckpoint(stream);
 }

 void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
    cu.setAsCurrent();
-//    int version;
-//    stream.read((char*) &version, sizeof(int));
-//    if (version != 1)
-//        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
-//    double time;
-//    stream.read((char*) &time, sizeof(double));
-//    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->setTime(time);
-//    stream.read((char*) &cu.getPosq()[0], sizeof(mm_float4)*cu.getPosq().getSize());
-//    cu.getPosq().upload();
-//    stream.read((char*) &cu.getVelm()[0], sizeof(mm_float4)*cu.getVelm().getSize());
-//    cu.getVelm().upload();
-//    stream.read((char*) &cu.getAtomIndex()[0], sizeof(cl_int)*cu.getAtomIndex().getSize());
-//    cu.getAtomIndex().upload();
-//    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(mm_int4)*cu.getPosCellOffsets().size());
-//    mm_float4 box;
-//    stream.read((char*) &box, sizeof(mm_float4));
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->setPeriodicBoxSize(box.x, box.y, box.z);
-//    cu.getIntegrationUtilities().loadCheckpoint(stream);
-//    SimTKOpenMMUtilities::loadCheckpoint(stream);
-//    for (int i = 0; i < cu.getReorderListeners().size(); i++)
-//        cu.getReorderListeners()[i]->execute();
+    int version;
+    stream.read((char*) &version, sizeof(int));
+    if (version != 1)
+        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
+    double time;
+    stream.read((char*) &time, sizeof(double));
+    int stepCount, computeForceCount;
+    stream.read((char*) &stepCount, sizeof(int));
+    stream.read((char*) &computeForceCount, sizeof(int));
+    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
+    for (int i = 0; i < (int) contexts.size(); i++) {
+        contexts[i]->setTime(time);
+        contexts[i]->setStepCount(stepCount);
+        contexts[i]->setComputeForceCount(computeForceCount);
+    }
+    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
+    char* buffer = (char*) cu.getPinnedBuffer();
+    stream.read(buffer, bufferSize);
+    cu.getPosq().upload(buffer);
+    stream.read(buffer, bufferSize);
+    cu.getVelm().upload(buffer);
+    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
+    cu.getAtomIndexArray().upload(cu.getAtomIndex());
+    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    double4 box;
+    stream.read((char*) &box, sizeof(double4));
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->setPeriodicBoxSize(box.x, box.y, box.z);
+    cu.getIntegrationUtilities().loadCheckpoint(stream);
+    SimTKOpenMMUtilities::loadCheckpoint(stream);
+    for (int i = 0; i < cu.getReorderListeners().size(); i++)
+        cu.getReorderListeners()[i]->execute();
 }

 void CudaApplyConstraintsKernel::initialize(const System& system) {
@@ -840,6 +854,7 @@ private:
 };

 CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
 }
@@ -926,6 +941,7 @@ private:
 };

 CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
+    cu.setAsCurrent();
    if (params1 != NULL)
        delete params1;
    if (params2 != NULL)
@@ -3983,8 +3999,8 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }

 void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -3995,6 +4011,7 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
 }

 void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double dt = integrator.getStepSize();
@@ -4042,8 +4059,8 @@ CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
 }

 void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4056,6 +4073,7 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
 }

 void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double temperature = integrator.getTemperature();
@@ -4120,8 +4138,8 @@ CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
 }

 void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4133,6 +4151,7 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
 }

 void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    double temperature = integrator.getTemperature();
@@ -4175,8 +4194,8 @@ CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel()
 }

 void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -4188,6 +4207,7 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con
 }

 double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();

@@ -4252,8 +4272,8 @@ CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKerne
 }

 void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4268,6 +4288,7 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
 }

 double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();

@@ -4412,8 +4433,8 @@ CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
 }

 void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
-    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
+    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
@@ -4492,6 +4513,7 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
 }

 void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
+    cu.setAsCurrent();
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    int numSteps = integrator.getNumComputations();

--- a/platforms/cuda2/src/CudaParallelKernels.cpp
+++ b/platforms/cuda2/src/CudaParallelKernels.cpp
--- a/platforms/cuda2/src/CudaParallelKernels.h
+++ b/platforms/cuda2/src/CudaParallelKernels.h
--- a/platforms/cuda2/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda2/src/kernels/findInteractingBlocks.cu
@@ -101,13 +101,14 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
 extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
        unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
-        unsigned int endTileIndex) {
+        unsigned int numTiles) {
    __shared__ ushort2 buffer[BUFFER_SIZE];
    __shared__ int valid[BUFFER_SIZE];
    __shared__ short sum[BUFFER_SIZE];
    __shared__ ushort2 temp[BUFFER_SIZE];
    __shared__ int bufferFull;
    __shared__ int globalIndex;
+    unsigned int endTileIndex = startTileIndex+numTiles;
    int valuesInBuffer = 0;
    if (threadIdx.x == 0)
        bufferFull = false;

--- a/platforms/cuda2/src/kernels/parallel.cu
+++ b/platforms/cuda2/src/kernels/parallel.cu
+/**
+ * Sum the forces computed by different contexts.
+ */
+
+extern "C" __global__ void sumForces(long long* __restrict__ force, long long* __restrict__ buffer, int bufferSize, int numBuffers) {
+    int totalSize = bufferSize*numBuffers;
+    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
+        long long sum = force[index];
+        for (int i = index; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        force[index] = sum;
+    }
+}
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
@@ -73,34 +73,4 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }

-/**
- * Sum a collection of buffers into the first one.
- */
-
-__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
-    int index = blockDim.x*blockIdx.x+threadIdx.x;
-    int totalSize = bufferSize*numBuffers;
-    while (index < bufferSize) {
-        float4 sum = buffer[index];
-        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
-            sum += buffer[i];
-        buffer[index] = sum;
-        index += blockDim.x*gridDim.x;
-    }
-}
-
-/**
- * Sum the various buffers containing forces.
- */
-__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
-    int totalSize = bufferSize*numBuffers;
-    float scale = 1.0f/(float) 0xFFFFFFFF;
-    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
-        float4 sum = make_float4(scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
-        for (int i = index; i < totalSize; i += bufferSize)
-            sum += buffer[i];
-        buffer[index] = sum;
-    }
-}
-
 }
\ No newline at end of file
--- a/platforms/cuda2/tests/TestCudaCheckpoints.cpp
+++ b/platforms/cuda2/tests/TestCudaCheckpoints.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests creating and loading checkpoints with the CUDA platform.
+ */
+
+#include "CudaPlatform.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/AndersenThermostat.h"
+#include "openmm/Context.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void compareStates(State& s1, State& s2) {
+    ASSERT_EQUAL_TOL(s1.getTime(), s2.getTime(), TOL);
+    int numParticles = s1.getPositions().size();
+    for (int i = 0; i < numParticles; i++) {
+        ASSERT_EQUAL_VEC(s1.getPositions()[i], s2.getPositions()[i], TOL);
+        ASSERT_EQUAL_VEC(s1.getVelocities()[i], s2.getVelocities()[i], TOL);
+        Vec3 a1, b1, c1, a2, b2, c2;
+        s1.getPeriodicBoxVectors(a1, b1, c1);
+        s2.getPeriodicBoxVectors(a2, b2, c2);
+        ASSERT_EQUAL_VEC(a1, a2, TOL);
+        ASSERT_EQUAL_VEC(b1, b2, TOL);
+        ASSERT_EQUAL_VEC(c1, c2, TOL);
+        for (map<string, double>::const_iterator iter = s1.getParameters().begin(); iter != s1.getParameters().end(); ++iter)
+            ASSERT_EQUAL(iter->second, (*s2.getParameters().find(iter->first)).second);
+    }
+}
+
+void testCheckpoint() {
+    const int numParticles = 100;
+    const double boxSize = 5.0;
+    const double temperature = 200.0;
+    CudaPlatform platform;
+    System system;
+    system.addForce(new AndersenThermostat(0.0, 100.0));
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(i%2 == 0 ? 0.1 : -0.1, 0.2, 0.1);
+        bool clash;
+        do {
+            clash = false;
+            positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            for (int j = 0; j < i; j++) {
+                Vec3 delta = positions[i]-positions[j];
+                if (sqrt(delta.dot(delta)) < 0.1)
+                    clash = true;
+            }
+        } while (clash);
+    }
+    VerletIntegrator integrator(0.001);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature);
+    
+    // Run for a little while.
+    
+    integrator.step(100);
+    
+    // Record the current state and make a checkpoint.
+    
+    State s1 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    stringstream stream1(ios_base::out | ios_base::in | ios_base::binary);
+    context.createCheckpoint(stream1);
+    
+    // Continue the simulation for a few more steps and record the state again.
+    
+    integrator.step(10);
+    State s2 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    
+    // Restore from the checkpoint and see if everything gets restored correctly.
+    
+    context.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    context.loadCheckpoint(stream1);
+    State s3 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s1, s3);
+    
+    // Now simulate from there and see if the trajectory is identical.
+    
+    integrator.step(10);
+    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s2, s4);
+    
+    // Create a new Context that uses multiple devices.
+
+    string deviceIndex = platform.getPropertyValue(context, CudaPlatform::CudaDeviceIndex());
+    map<string, string> props;
+    props[CudaPlatform::CudaDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    context2.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    context2.setParameter(AndersenThermostat::Temperature(), temperature);
+    
+    // Now repeat all of the above tests with it.
+
+    integrator2.step(100);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    stringstream stream2(ios_base::out | ios_base::in | ios_base::binary);
+    context2.createCheckpoint(stream2);
+    integrator2.step(10);
+    State s6 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    context2.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
+    context2.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    context2.loadCheckpoint(stream2);
+    State s7 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s5, s7);
+    integrator2.step(10);
+    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s6, s8);
+}
+
+int main() {
+    try {
+        testCheckpoint();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaCustomAngleForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomAngleForce.cpp
@@ -164,7 +164,7 @@ void testParallelComputation() {
 int main() {
    try {
        testAngles();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomBondForce.cpp
@@ -169,7 +169,7 @@ int main() {
    try {
        testBonds();
        testManyParameters();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomCompoundBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomCompoundBondForce.cpp
@@ -205,7 +205,7 @@ int main() {
    try {
        testBond();
        testPositionDependence();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomExternalForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomExternalForce.cpp
@@ -166,7 +166,7 @@ int main() {
    try {
        testForce();
        testManyParameters();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomNonbondedForce.cpp
@@ -424,7 +424,7 @@ int main() {
        testPeriodic();
        testTabulatedFunction();
        testCoulombLennardJones();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaCustomTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomTorsionForce.cpp
@@ -205,7 +205,7 @@ int main() {
    try {
        testTorsions();
        testRange();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaHarmonicAngleForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicAngleForce.cpp
@@ -130,7 +130,7 @@ void testParallelComputation() {
 int main() {
    try {
        testAngles();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
@@ -121,7 +121,7 @@ void testParallelComputation() {
 int main() {
    try {
        testBonds();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda2/tests/TestCudaNonbondedForce.cpp
@@ -814,8 +814,8 @@ int main() {
        testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
-//        testParallelComputation(false);
-//        testParallelComputation(true);
+        testParallelComputation(false);
+        testParallelComputation(true);
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaPeriodicTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaPeriodicTorsionForce.cpp
@@ -124,7 +124,7 @@ void testParallelComputation() {
 int main() {
    try {
        testPeriodicTorsions();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda2/tests/TestCudaRBTorsionForce.cpp
+++ b/platforms/cuda2/tests/TestCudaRBTorsionForce.cpp
@@ -143,7 +143,7 @@ void testParallelComputation() {
 int main() {
    try {
        testRBTorsions();
-//        testParallelComputation();
+        testParallelComputation();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;