Merge branch 'master' of github.com:pandegroup/openmm into genpt

2d2f05ce · Andy Simmonett · 94823d84 · 4d32047c · 2d2f05ce · 2d2f05ce
Commit 2d2f05ce authored Dec 08, 2015 by Andy Simmonett
20 changed files
--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -28,7 +28,7 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "gmx_atomic.h"
+#include "openmm/internal/gmx_atomic.h"
 #include <algorithm>

 // In case we're using some primitive version of Visual Studio this will
@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    threads.execute(task);
    threads.waitForThreads();
    
+    // Signal the threads to subtract the exclusions.
+    
+    if (ewald || pme) {
+        gmx_atomic_set(&counter, 0);
+        threads.resumeThreads();
+        threads.waitForThreads();
+    }
+    
    // Combine the energies from all the threads.
    
    if (totalEnergy != NULL) {
@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex

        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.

-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
-            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
-            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
-                if (*iter > i) {
-                    int j = *iter;
-                    fvec4 deltaR;
-                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
-                    float r2;
-                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
-                    float r = sqrtf(r2);
-                    float inverseR = 1/r;
-                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
-                    float alphaR = alphaEwald*r;
-                    float erfAlphaR = erf(alphaR);
-                    if (erfAlphaR > 1e-6f) {
-                        float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
-                        dEdR = (float) (dEdR * (erfAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
-                        fvec4 result = deltaR*dEdR;
-                        (fvec4(forces+4*i)-result).store(forces+4*i);
-                        (fvec4(forces+4*j)+result).store(forces+4*j);
-                        if (includeEnergy)
-                            threadEnergy[threadIndex] -= chargeProd*inverseR*erfAlphaR;
+        threads.syncThreads();
+        const int groupSize = max(1, numberOfAtoms/(10*numThreads));
+        while (true) {
+            int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
+            if (start >= numberOfAtoms)
+                break;
+            int end = min(start+groupSize, numberOfAtoms);
+            for (int i = start; i < end; i++) {
+               fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+                float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
+                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
+                    if (*iter > i) {
+                        int j = *iter;
+                        fvec4 deltaR;
+                        fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
+                        float r2;
+                        getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
+                        float r = sqrtf(r2);
+                        float alphaR = alphaEwald*r;
+                        float erfAlphaR = erf(alphaR);
+                        if (erfAlphaR > 1e-6f) {
+                            float inverseR = 1/r;
+                            float chargeProdOverR = scaledChargeI*posq[4*j+3]*inverseR;
+                            float dEdR = chargeProdOverR*inverseR*inverseR;
+                            dEdR = dEdR * (erfAlphaR-(float)TWO_OVER_SQRT_PI*alphaR*(float)exp(-alphaR*alphaR));
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                            if (includeEnergy)
+                                threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
+                        }
                    }
                }
            }

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,6 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "CpuSETTLE.h"
+#include "openmm/internal/gmx_atomic.h"

 using namespace OpenMM;
 using namespace std;
@@ -39,10 +40,14 @@ public:
    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+        gmx_atomic_set(&atomicCounter, 0);
    }
    void execute(ThreadPool& threads, int threadIndex) {
-        if (threadIndex < threadSettle.size()) {
-            threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
        }
    }
    vector<OpenMM::RealVec>& atomCoordinates;
@@ -50,6 +55,7 @@ public:
    vector<RealOpenMM>& inverseMasses;
    RealOpenMM tolerance;
    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+    gmx_atomic_t atomicCounter;
 };

 class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
@@ -57,10 +63,14 @@ public:
    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+        gmx_atomic_set(&atomicCounter, 0);
    }
    void execute(ThreadPool& threads, int threadIndex) {
-        if (threadIndex < threadSettle.size()) {
-            threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
        }
    }
    vector<OpenMM::RealVec>& atomCoordinates;
@@ -68,17 +78,18 @@ public:
    vector<RealOpenMM>& inverseMasses;
    RealOpenMM tolerance;
    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+    gmx_atomic_t atomicCounter;
 };

 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
-    int numThreads = threads.getNumThreads();
+    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
    vector<RealOpenMM> mass(system.getNumParticles());
    for (int i = 0; i < system.getNumParticles(); i++)
        mass[i] = system.getParticleMass(i);
-    for (int i = 0; i < numThreads; i++) {
-        int start = i*numClusters/numThreads;
-        int end = (i+1)*numClusters/numThreads;
+    for (int i = 0; i < numBlocks; i++) {
+        int start = i*numClusters/numBlocks;
+        int end = (i+1)*numClusters/numBlocks;
        if (start != end) {
            int numThreadClusters = end-start;
            vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);

--- a/serialization/include/openmm/serialization/GBVIForceProxy.h
+++ b/serialization/include/openmm/serialization/GBVIForceProxy.h
-#ifndef OPENMM_GBVIFORCE_PROXY_H_
-#define OPENMM_GBVIFORCE_PROXY_H_
-
 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
@@ -9,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,22 +29,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "openmm/internal/windowsExport.h"
-#include "openmm/serialization/SerializationProxy.h"
-
-namespace OpenMM {
-
-/**
- * This is a proxy for serializing GBVIForce objects.
- */
-
-class OPENMM_EXPORT GBVIForceProxy : public SerializationProxy {
-public:
-    GBVIForceProxy();
-    void serialize(const void* object, SerializationNode& node) const;
-    void* deserialize(const SerializationNode& node) const;
-};
-
-} // namespace OpenMM
+#include "CpuTests.h"
+#include "TestCompoundIntegrator.h"

-#endif /*OPENMM_GBVIFORCE_PROXY_H_*/
+void runPlatformTests() {
+}
--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
@@ -13,9 +13,9 @@
 #----------------------------------------------------

 set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
-if(OPENMM_BUILD_CUDA_TESTS)
+if(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)
    SUBDIRS (tests)
-endif(OPENMM_BUILD_CUDA_TESTS)
+endif(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)

 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,6 +30,7 @@
 #include <map>
 #include <queue>
 #include <string>
+#include <utility>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
@@ -538,6 +539,11 @@ public:
     */
    void invalidateMolecules();
 private:
+    /**
+     * Compute a sorted list of device indices in decreasing order of desirability
+     */
+    std::vector<int> getDevicePrecedence();
+
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;

--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -62,6 +62,14 @@ public:
    CudaArray& getStepSize() {
        return *stepSize;
    }
+    /**
+     * Set the size to use for the next step.
+     */
+    void setNextStepSize(double size);
+    /**
+     * Get the size that was used for the last step.
+     */
+    double getLastStepSize();
    /**
     * Apply constraints to the atom positions.
     *
@@ -154,6 +162,7 @@ private:
    CudaArray* vsiteLocalCoordsParams;
    int randomPos;
    int lastSeed, numVsites;
+    double2 lastStepSize;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -657,6 +657,7 @@ private:
    CudaArray* pmeBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
+    CudaArray* pmeEnergyBuffer;
    CudaSort* sort;
    Kernel cpuPme;
    PmeIO* pmeio;
@@ -1123,7 +1124,6 @@ public:
    double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
 private:
    CudaContext& cu;
-    double prevStepSize;
    CUfunction kernel1, kernel2;
 };

@@ -1354,7 +1354,7 @@ private:
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
    CudaContext& cu;
-    double prevStepSize, energy;
+    double energy;
    float energyFloat;
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -121,7 +121,7 @@ public:
    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM, peerAccessSupported, useCpuPme;
+    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda/src/CudaBondedUtilities.cpp
@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
    s<<CudaKernelSources::vectorOps;
    for (int i = 0; i < (int) prefixCode.size(); i++)
        s<<prefixCode[i];
-    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
    for (int force = 0; force < numForces; force++) {
        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
            int indexWidth = atomIndices[force][i]->getElementSize()/4;
@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
    for (int i = 0; i < (int) arguments.size(); i++)
        s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
    s<<") {\n";
-    s<<"real energy = 0;\n";
+    s<<"mixed energy = 0;\n";
    for (int force = 0; force < numForces; force++)
        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
    s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    if (deviceIndex < -1 || deviceIndex >= numDevices)
        throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
+
+    vector<int> devicePrecedence;
    if (deviceIndex == -1) {
-        // Try to figure out which device is the fastest.
-
-        int bestSpeed = -1;
-        int bestCompute = -1;
-        for (int i = 0; i < numDevices; i++) {
-            CHECK_RESULT(cuDeviceGet(&device, i));
-            int major, minor, clock, multiprocessors;
-            CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-            if (major == 1 && minor < 2)
-                continue; // 1.0 and 1.1 are not supported
-            CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
-            CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-            int speed = clock*multiprocessors;
-            if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
-                deviceIndex = i;
-                bestSpeed = speed;
-                bestCompute = major;
-            }
+        devicePrecedence = getDevicePrecedence();
+    } else {
+        devicePrecedence.push_back(deviceIndex);
+    }
+
+    this->deviceIndex = -1;
+    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+        int trialDeviceIndex = devicePrecedence[i];
+        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
+        defaultOptimizationOptions = "--use_fast_math";
+        unsigned int flags = CU_CTX_MAP_HOST;
+        if (useBlockingSync)
+            flags += CU_CTX_SCHED_BLOCKING_SYNC;
+        else
+            flags += CU_CTX_SCHED_SPIN;
+
+        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+            this->deviceIndex = trialDeviceIndex;
+            break;
        }
    }
-    if (deviceIndex == -1)
-        throw OpenMMException("No compatible CUDA device is available");
-    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
-    this->deviceIndex = deviceIndex;
+    if (this->deviceIndex == -1)
+        if (deviceIndex != -1)
+            throw OpenMMException("The requested CUDA device could not be loaded");
+        else
+            throw OpenMMException("No compatible CUDA device is available");
+
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    // This is a workaround to support GTX 980 with CUDA 6.5.  It reports its compute capability
-    // as 5.2, but the compiler doesn't support anything beyond 5.0.  We can remove this once
-    // CUDA 7.0 is released.
-    if (major == 5)
-        minor = 0;
+#if __CUDA_API_VERSION < 7000
+        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
+        // its compute capability as 5.2, but the compiler doesn't support
+        // anything beyond 5.0.
+        if (major == 5)
+            minor = 0;
+#endif
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
-    if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
-        throw OpenMMException("This device does not support double precision");
-    defaultOptimizationOptions = "--use_fast_math";
-    unsigned int flags = CU_CTX_MAP_HOST;
-    if (useBlockingSync)
-        flags += CU_CTX_SCHED_BLOCKING_SYNC;
-    else
-        flags += CU_CTX_SCHED_SPIN;
-    CHECK_RESULT(cuCtxCreate(&context, flags, device));
+
    contextIsValid = true;
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
    if (contextIndex > 0) {
@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
-    
+
    // Set defines for applying periodic boundary conditions.
-    
+
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }

    // Create the work thread used for parallelization when running on multiple devices.
-    
+
    thread = new WorkThread();
-    
+
    // Create utilities objects.
-    
+
    bonded = new CudaBondedUtilities(*this);
    nonbonded = new CudaNonbondedUtilities(*this);
    integration = new CudaIntegrationUtilities(*this, system);
@@ -368,7 +367,7 @@ void CudaContext::initialize() {
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
            if (index != result.npos) {
                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
                    // We have found a complete symbol, not part of a longer symbol.
-                    
+
                    result.replace(index, size, iter->second);
                    index += iter->second.size();
                }
@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
        return -1;
    }
    WaitForSingleObject(pi.hProcess, INFINITE);
-    DWORD exitCode = -1;  
+    DWORD exitCode = -1;
    if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
        throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
    } else {
-        if(exitCode == 0) 
+        if(exitCode == 0)
            return 0;
        else
            return -1;
@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    if (!defines.empty())
        src << endl;
    src << source << endl;
-    
+
    // See whether we already have PTX for this kernel cached.
-    
+
    CSHA1 sha1;
    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
    sha1.Final();
@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
-    
+
    // Select names for the various temporary files.
-    
+
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
 #ifdef WIN32
@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    int res = 0;

    // If the runtime compiler plugin is available, use it.
-    
+
    if (hasCompilerKernel) {
        string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
-        
+
        // If possible, write the PTX out to a temporary file so we can cache it for later use.
-        
+
        bool wroteCache = false;
        try {
            ofstream out(outputFile.c_str());
@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        }
        if (!wroteCache) {
            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
-            
+
            CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
            return module;
        }
@@ -883,7 +882,7 @@ private:

 void CudaContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.
-    
+
    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.

@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }
-            
+
            // See if the constraints are identical.

            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
    }
    if (valid)
        return;
-    
+
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
-    
+
    vector<int4> newCellOffsets(numAtoms);
    if (useDoublePrecision) {
        vector<double4> oldPosq(paddedNumAtoms);
@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
            molPos[i].x *= invNumAtoms;
            molPos[i].y *= invNumAtoms;
            molPos[i].z *= invNumAtoms;
+            if (molPos[i].x != molPos[i].x)
+                throw OpenMMException("Particle coordinate is nan");
        }
        if (nonbonded->getUsePeriodic()) {
            // Move each molecule position into the same box.
@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
 }
+
+
+vector<int> CudaContext::getDevicePrecedence() {
+    int numDevices;
+    CUdevice thisDevice;
+    string errorMessage = "Error initializing Context";
+    vector<pair<pair<int, int>, int> > devices;
+
+    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
+        int major, minor, clock, multiprocessors, speed;
+        CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
+        if (major == 1 && minor < 2)
+            continue;
+
+        if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
+            continue;
+
+        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
+        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
+        speed = clock*multiprocessors;
+        pair<int, int> deviceProperties = std::make_pair(major, speed);
+        devices.push_back(std::make_pair(deviceProperties, -i));
+    }
+
+    // sort first by compute capability (higher is better), then speed
+    // (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end());
+    std::reverse(devices.begin(), devices.end());
+
+    vector<int> precedence;
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second);
+    }
+
+    return precedence;
+}
--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                }
                out << ");\n";
                out << "APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)\n";
-                out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);\n";
+                out << tempType << " periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;\n";
+                out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_r2);\n";
                for (int j = 0; j < nodes.size(); j++) {
                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                    int argIndex = -1;
@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                    if (argIndex == -1)
                        out << nodeNames[j] << " = RECIP(periodicDistance_rinv);\n";
                    else if (argIndex == 0)
-                        out << nodeNames[j] << " = periodicDistance_delta.x*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 1)
-                        out << nodeNames[j] << " = periodicDistance_delta.y*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 2)
-                        out << nodeNames[j] << " = periodicDistance_delta.z*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 3)
-                        out << nodeNames[j] << " = -periodicDistance_delta.x*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 4)
-                        out << nodeNames[j] << " = -periodicDistance_delta.y*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
                    else if (argIndex == 5)
-                        out << nodeNames[j] << " = -periodicDistance_delta.z*periodicDistance_rinv;\n";
+                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
                }
            }
            else {

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
    // Create workspace arrays.

+    lastStepSize = make_double2(0.0, 0.0);
    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
        posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<double2>(context, 1, "stepSize");
-        vector<double2> step(1, make_double2(0.0, 0.0));
-        stepSize->upload(step);
+        stepSize->upload(&lastStepSize);
    }
    else {
        posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<float2>(context, 1, "stepSize");
-        vector<float2> step(1, make_float2(0.0f, 0.0f));
-        stepSize->upload(step);
+        float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
+        stepSize->upload(&lastStepSizeFloat);
    }

    // Record the set of constraints and how many constraints each atom is involved in.
@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteLocalCoordsParams;
 }

+void CudaIntegrationUtilities::setNextStepSize(double size) {
+    if (size != lastStepSize.x || size != lastStepSize.y) {
+        lastStepSize = make_double2(size, size);
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            stepSize->upload(&lastStepSize);
+        else {
+            float2 lastStepSizeFloat = make_float2((float) size, (float) size);
+            stepSize->upload(&lastStepSizeFloat);
+        }
+    }
+}
+
+double CudaIntegrationUtilities::getLastStepSize() {
+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+        stepSize->download(&lastStepSize);
+    else {
+        float2 lastStepSizeFloat;
+        stepSize->download(&lastStepSizeFloat);
+        lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
+    }
+    return lastStepSize.y;
+}
+
 void CudaIntegrationUtilities::applyConstraints(double tol) {
    applyConstraints(false, tol);
 }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        CudaArray& energyArray = cu.getEnergyBuffer();
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            double* energy = (double*) cu.getPinnedBuffer();
            energyArray.download(energy);
            for (int i = 0; i < energyArray.getSize(); i++)
@@ -1458,16 +1458,24 @@ private:

 class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
 public:
-    SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) {
+    SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
+            addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
    }
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0)
            cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
+        if (includeEnergy) {
+            int bufferSize = pmeEnergyBuffer.getSize();
+            void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
+            cu.executeKernel(addEnergyKernel, args, bufferSize);
+        }
        return 0.0;
    }
 private:
    CudaContext& cu;
    CUevent event;
+    CUfunction addEnergyKernel;
+    CudaArray& pmeEnergyBuffer;
    int forceGroup;
 };

@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
        delete pmeAtomGridIndex;
+    if (pmeEnergyBuffer != NULL)
+        delete pmeEnergyBuffer;
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
+                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+                pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
+                cu.clearBuffer(*pmeEnergyBuffer);
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
                int cufftVersion;
                cufftGetVersion(&cufftVersion);
@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
-                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup));
+                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
                }
                hasInitializedFFT = true;

@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        }

        if (includeEnergy) {
-            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
                    &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
            cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
    CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
    kernel1 = cu.getKernel(module, "integrateVerletPart1");
    kernel2 = cu.getKernel(module, "integrateVerletPart2");
-    prevStepSize = -1.0;
 }

 void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    int numAtoms = cu.getNumAtoms();
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    double dt = integrator.getStepSize();
-    if (dt != prevStepSize) {
-        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-            vector<double2> stepSizeVec(1);
-            stepSizeVec[0] = make_double2(dt, dt);
-            cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
-        }
-        else {
-            vector<float2> stepSizeVec(1);
-            stepSizeVec[0] = make_float2((float) dt, (float) dt);
-            cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
-        }
-        prevStepSize = dt;
-    }
+    cu.getIntegrationUtilities().setNextStepSize(dt);

    // Call the first integration kernel.

@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
+    cu.getIntegrationUtilities().setNextStepSize(stepSize);
    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
        // Calculate the integration parameters.

@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
            p[1] = fscale;
            p[2] = noisescale;
            params->upload(p);
-            double2 ss = make_double2(0, stepSize);
-            integration.getStepSize().upload(&ss);
        }
        else {
            vector<float> p(params->getSize());
@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
            p[1] = (float) fscale;
            p[2] = (float) noisescale;
            params->upload(p);
-            float2 ss = make_float2(0, (float) stepSize);
-            integration.getStepSize().upload(&ss);
        }
        prevTemp = temperature;
        prevFriction = friction;
@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons

    // Update the time and step count.

-    double dt, time;
+    double dt = cu.getIntegrationUtilities().getLastStepSize();
+    double time = cu.getTime()+dt;
    if (useDouble) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSizeFloat)
            time = maxTime; // Avoid round-off error
    }
@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co

    // Update the time and step count.

-    double dt, time;
+    double dt = cu.getIntegrationUtilities().getLastStepSize();
+    double time = cu.getTime()+dt;
    if (useDouble) {
-        double2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
    }
    else {
-        float2 stepSize;
-        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-        dt = stepSize.y;
-        time = cu.getTime()+dt;
        if (dt == maxStepSizeFloat)
            time = maxTime; // Avoid round-off error
    }
@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    summedValue = new CudaArray(cu, 1, elementSize, "summedValue");
    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
-    prevStepSize = -1.0;
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
 }

@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    }
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
-    if (stepSize != prevStepSize) {
-        recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
-    }
+    recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        double value = context.getParameter(parameterNames[i]);
        if (value != globalValuesDouble[parameterVariableIndex[i]]) {
@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
 void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
    switch (target.type) {
        case DT:
+            if (value != globalValuesDouble[dtVariableIndex])
+                deviceGlobalsAreCurrent = false;
            globalValuesDouble[dtVariableIndex] = value;
-            deviceGlobalsAreCurrent = false;
-            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-                double size[] = {0, value};
-                cu.getIntegrationUtilities().getStepSize().upload(size);
-            }
-            else {
-                float size[] = {0, (float) value};
-                cu.getIntegrationUtilities().getStepSize().upload(size);
-            }
-            prevStepSize = value;
+            cu.getIntegrationUtilities().setNextStepSize(value);
            break;
        case VARIABLE:
        case PARAMETER:

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
 }

 void CudaPlatform::PlatformData::initializeContexts(const System& system) {
+    if (hasInitializedContexts)
+        return;
    for (int i = 0; i < (int) contexts.size(); i++)
        contexts[i]->initialize();
+    hasInitializedContexts = true;
 }

 void CudaPlatform::PlatformData::syncContexts() {

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute the forces on groups based on the bonds.
 */
-extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, real* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
+extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
        const int* __restrict__ bondGroups
        EXTRA_ARGS) {
-    real energy = 0;
+    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
        COMPUTE_FORCE
    }

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -13,7 +13,7 @@ typedef struct {
 /**
 * Compute a force based on pair interactions.
 */
-extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];

    // First loop: process tiles that contain exclusions.

--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
@@ -2,9 +2,9 @@
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 */

-extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
+extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
-    real energy = 0;
+    mixed energy = 0;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Load the derivatives


--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute forces on donors.
 */
-extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    extern __shared__ real4 posBuffer[];
-    real energy = 0;
+    mixed energy = 0;
    real3 f1 = make_real3(0);
    real3 f2 = make_real3(0);
    real3 f3 = make_real3(0);
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
 /**
 * Compute forces on acceptors.
 */
-extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
 * Compute the interaction.
 */
 extern "C" __global__ void computeInteraction(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
 #ifdef USE_CUTOFF
        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
 #endif
        PARAMETER_ARGUMENTS) {
-    real energy = 0.0f;
+    mixed energy = 0;
    
    // Loop over particles to be the first one in the set.
    

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -9,14 +9,14 @@ typedef struct {
 } AtomData;

 extern "C" __global__ void computeInteractionGroups(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-    real energy = 0.0f;
+    mixed energy = 0;
    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];

    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;