Began implementing new mixed precision model that does integration in double...

Began implementing new mixed precision model that does integration in double precision and force evaluation in single precision

Began implementing new mixed precision model that does integration in double...
Began implementing new mixed precision model that does integration in double precision and force evaluation in single precision
9ad85ebd · Peter Eastman · 7492dc48 · 9ad85ebd · 9ad85ebd · 9ad85ebd
Commit 9ad85ebd authored Oct 02, 2012 by Peter Eastman
12 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -67,22 +67,22 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
    }
    if (precision == "single") {
        useDoublePrecision = false;
-        accumulateInDouble = false;
+        useMixedPrecision = false;
    }
    else if (precision == "mixed") {
        useDoublePrecision = false;
-        accumulateInDouble = true;
+        useMixedPrecision = true;
    }
    else if (precision == "double") {
        useDoublePrecision = true;
-        accumulateInDouble = true;
+        useMixedPrecision = false;
    }
    else
        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
@@ -150,16 +150,37 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
        compilationDefines["make_real4"] = "make_double4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
+    else if (useMixedPrecision) {
+        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
+        posqCorrection = CudaArray::create<float4>(*this, paddedNumAtoms, "posqCorrection");
+        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["USE_MIXED_PRECISION"] = "1";
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
    else {
        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
        velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_float2";
+        compilationDefines["make_mixed3"] = "make_float3";
+        compilationDefines["make_mixed4"] = "make_float4";
        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
@@ -211,6 +232,8 @@ CudaContext::~CudaContext() {
        cuMemFreeHost(pinnedBuffer);
    if (posq != NULL)
        delete posq;
+    if (posqCorrection != NULL)
+        delete posqCorrection;
    if (velm != NULL)
        delete velm;
    if (force != NULL)
@@ -237,7 +260,7 @@ void CudaContext::initialize() {
    string errorMessage = "Error initializing Context";
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
-        if (useDoublePrecision)
+        if (useDoublePrecision || useMixedPrecision)
            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
        else
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
@@ -308,6 +331,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << "typedef float3 real3;\n";
        src << "typedef float4 real4;\n";
    }
+    if (useDoublePrecision || useMixedPrecision) {
+        src << "typedef double mixed;\n";
+        src << "typedef double2 mixed2;\n";
+        src << "typedef double3 mixed3;\n";
+        src << "typedef double4 mixed4;\n";
+    }
+    else {
+        src << "typedef float mixed;\n";
+        src << "typedef float2 mixed2;\n";
+        src << "typedef float3 mixed3;\n";
+        src << "typedef float4 mixed4;\n";
+    }
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
@@ -789,6 +824,22 @@ void CudaContext::validateMolecules() {
        posq->upload(newPosq);
        velm->upload(newVelm);
    }
+    else if (useMixedPrecision) {
+        vector<float4> oldPosq(paddedNumAtoms);
+        vector<float4> newPosq(paddedNumAtoms);
+        vector<double4> oldVelm(paddedNumAtoms);
+        vector<double4> newVelm(paddedNumAtoms);
+        posq->download(oldPosq);
+        velm->download(oldVelm);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        posq->upload(newPosq);
+        velm->upload(newVelm);
+    }
    else {
        vector<float4> oldPosq(paddedNumAtoms);
        vector<float4> newPosq(paddedNumAtoms);
@@ -822,19 +873,24 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) {
        validateMolecules();
    atomsWereReordered = true;
    if (useDoublePrecision)
-        reorderAtomsImpl<double, double4>(enforcePeriodic);
+        reorderAtomsImpl<double, double4, double, double4>(enforcePeriodic);
+    else if (useMixedPrecision)
+        reorderAtomsImpl<float, float4, double, double4>(enforcePeriodic);
    else
-        reorderAtomsImpl<float, float4>(enforcePeriodic);
+        reorderAtomsImpl<float, float4, float, float4>(enforcePeriodic);
 }

-template <class Real, class Real4>
+template <class Real, class Real4, class Mixed, class Mixed4>
 void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
    // Find the range of positions and the number of bins along each axis.

    vector<Real4> oldPosq(paddedNumAtoms);
-    vector<Real4> oldVelm(paddedNumAtoms);
+    vector<Real4> oldPosqCorrection(paddedNumAtoms);
+    vector<Mixed4> oldVelm(paddedNumAtoms);
    posq->download(oldPosq);
    velm->download(oldVelm);
+    if (useMixedPrecision)
+        posqCorrection->download(oldPosqCorrection);
    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
@@ -860,7 +916,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {

    vector<int> originalIndex(numAtoms);
    vector<Real4> newPosq(paddedNumAtoms);
-    vector<Real4> newVelm(paddedNumAtoms);
+    vector<Real4> newPosqCorrection(paddedNumAtoms);
+    vector<Mixed4> newVelm(paddedNumAtoms);
    vector<int4> newCellOffsets(numAtoms);
    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
        // Find the center of each molecule.
@@ -959,6 +1016,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
                int newIndex = mol.offsets[i]+atoms[j];
                originalIndex[newIndex] = atomIndex[oldIndex];
                newPosq[newIndex] = oldPosq[oldIndex];
+                if (useMixedPrecision)
+                    newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
                newVelm[newIndex] = oldVelm[oldIndex];
                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
            }
@@ -972,6 +1031,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
        posCellOffsets[i] = newCellOffsets[i];
    }
    posq->upload(newPosq);
+    if (useMixedPrecision)
+        posqCorrection->upload(newPosqCorrection);
    velm->upload(newVelm);
    atomIndexDevice->upload(atomIndex);
    for (int i = 0; i < (int) reorderListeners.size(); i++)

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -135,6 +135,12 @@ public:
    CudaArray& getPosq() {
        return *posq;
    }
+    /**
+     * Get the array which contains a correction to the position of each atom.  This only exists if getUseMixedPrecision() returns true.
+     */
+    CudaArray& getPosqCorrection() {
+        return *posqCorrection;
+    }
    /**
     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
     */
@@ -314,10 +320,10 @@ public:
        return useDoublePrecision;
    }
    /**
-     * Get whether accumulation is being done in double precision.
+     * Get whether mixed precision is being used.
     */
-    bool getAccumulateInDouble() {
-        return accumulateInDouble;
+    bool getUseMixedPrecision() {
+        return useMixedPrecision;
    }
    /**
     * Convert a number to a string in a format suitable for including in a kernel.
@@ -455,7 +461,7 @@ private:
    /**
     * This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
     */
-    template <class Real, class Real4>
+    template <class Real, class Real4, class Mixed, class Mixed4>
    void reorderAtomsImpl(bool enforcePeriodic);
    static bool hasInitializedCuda;
    const System& system;
@@ -469,7 +475,7 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
+    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
    float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxSize, invPeriodicBoxSize;
@@ -489,6 +495,7 @@ private:
    std::vector<int4> posCellOffsets;
    void* pinnedBuffer;
    CudaArray* posq;
+    CudaArray* posqCorrection;
    CudaArray* velm;
    CudaArray* force;
    CudaArray* energyBuffer;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -104,7 +104,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.

-    if (context.getUseDoublePrecision()) {
+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
        posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
@@ -473,7 +473,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
-        if (context.getUseDoublePrecision()) {
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
            ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance");
            ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1");
            ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2");
@@ -717,23 +717,24 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        ccmaForceKernel = ccmaPosForceKernel;
    }
    float floatTol = (float) tol;
-    void* tolPointer = (context.getUseDoublePrecision() ? (void*) &tol : (void*) &floatTol);
+    void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
+    CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
    if (settleAtoms != NULL) {
        int numClusters = settleAtoms->getSize();
-        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(),
+        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
                &posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(),
                &settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()};
        context.executeKernel(settleKernel, args, settleAtoms->getSize());
    }
    if (shakeAtoms != NULL) {
        int numClusters = shakeAtoms->getSize();
-        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(),
+        void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()};
        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
    }
    if (ccmaAtoms != NULL) {
-        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer()};
+        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
        int i;
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
@@ -768,7 +769,8 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double

 void CudaIntegrationUtilities::computeVirtualSites() {
    if (numVsites > 0) {
-        void* args[] = {&context.getPosq().getDevicePointer(), &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
+        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
+        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()};
        context.executeKernel(vsitePositionKernel, args, numVsites);
@@ -777,7 +779,8 @@ void CudaIntegrationUtilities::computeVirtualSites() {

 void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
-        void* args[] = {&context.getPosq().getDevicePointer(), &context.getForce().getDevicePointer(),
+        CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
+        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(),
                &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()};

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -148,6 +148,18 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
            positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
        }
    }
+    else if (cu.getUseMixedPrecision()) {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        vector<float4> posCorrection;
+        cu.getPosq().download(posq);
+        cu.getPosqCorrection().download(posCorrection);
+        for (int i = 0; i < numParticles; ++i) {
+            float4 pos1 = posq[i];
+            float4 pos2 = posCorrection[i];
+            int4 offset = cu.getPosCellOffsets()[i];
+            positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z);
+        }
+    }
    else {
        float4* posq = (float4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
@@ -183,14 +195,28 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<
        for (int i = 0; i < numParticles; ++i) {
            float4& pos = posq[i];
            const Vec3& p = positions[order[i]];
-            pos.x = p[0];
-            pos.y = p[1];
-            pos.z = p[2];
+            pos.x = (float) p[0];
+            pos.y = (float) p[1];
+            pos.z = (float) p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
            posq[i] = make_float4(0.0, 0.0, 0.0, 0.0);
        cu.getPosq().upload(posq);
    }
+    if (cu.getUseMixedPrecision()) {
+        float4* posCorrection = (float4*) cu.getPinnedBuffer();
+        for (int i = 0; i < numParticles; ++i) {
+            float4& c = posCorrection[i];
+            const Vec3& p = positions[order[i]];
+            c.x = (float) (p[0]-(float)p[0]);
+            c.y = (float) (p[1]-(float)p[1]);
+            c.z = (float) (p[2]-(float)p[2]);
+            c.w = 0;
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            posCorrection[i] = make_float4(0.0, 0.0, 0.0, 0.0);
+        cu.getPosqCorrection().upload(posCorrection);
+    }
    for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++)
        cu.getPosCellOffsets()[i] = make_int4(0, 0, 0, 0);
 }
@@ -200,7 +226,7 @@ void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        double4* velm = (double4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
@@ -224,7 +250,7 @@ void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector
    cu.setAsCurrent();
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        double4* velm = (double4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
@@ -290,12 +316,11 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
    stream.write((char*) &stepCount, sizeof(int));
    int computeForceCount = cu.getComputeForceCount();
    stream.write((char*) &computeForceCount, sizeof(int));
-    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
    char* buffer = (char*) cu.getPinnedBuffer();
    cu.getPosq().download(buffer);
-    stream.write(buffer, bufferSize);
+    stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
    cu.getVelm().download(buffer);
-    stream.write(buffer, bufferSize);
+    stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
    double4 box = cu.getPeriodicBoxSize();
@@ -321,11 +346,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
        contexts[i]->setStepCount(stepCount);
        contexts[i]->setComputeForceCount(computeForceCount);
    }
-    int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
    char* buffer = (char*) cu.getPinnedBuffer();
-    stream.read(buffer, bufferSize);
+    stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
    cu.getPosq().upload(buffer);
-    stream.read(buffer, bufferSize);
+    stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
    cu.getVelm().upload(buffer);
    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
    cu.getAtomIndexArray().upload(cu.getAtomIndex());
@@ -2016,7 +2040,7 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
        defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
        map<string, string> replacements;
        stringstream defineAccum;
-        if (cu.getAccumulateInDouble()) {
+        if (cu.getUseMixedPrecision()) {
            defineAccum << "typedef double accum;\n";
            defineAccum << "typedef double4 accum4;\n";
            defines["make_accum4"] = "make_double4";
@@ -2531,7 +2555,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
        map<string, string> defines;
        stringstream defineAccum;
-        if (cu.getAccumulateInDouble()) {
+        if (cu.getUseMixedPrecision()) {
            defineAccum << "typedef double accum;\n";
            defineAccum << "typedef double3 accum3;\n";
            defines["make_accum3"] = "make_double3";
@@ -3971,7 +3995,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    int numAtoms = cu.getNumAtoms();
    double dt = integrator.getStepSize();
    if (dt != prevStepSize) {
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            vector<double2> stepSizeVec(1);
            stepSizeVec[0] = make_double2(dt, dt);
            cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
@@ -3986,7 +4010,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn

    // Call the first integration kernel.

-    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel1, args1, numAtoms);

@@ -3996,7 +4021,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn

    // Call the second integration kernel.

-    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
+    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4023,7 +4048,7 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
-    params = new CudaArray(cu, 3, cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "langevinParams");
+    params = new CudaArray(cu, 3, cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float), "langevinParams");
    prevStepSize = -1.0;
 }

@@ -4042,7 +4067,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
        double vscale = exp(-stepSize/tau);
        double fscale = (1-vscale)*tau;
        double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            vector<double> p(params->getSize());
            p[0] = vscale;
            p[1] = fscale;
@@ -4078,7 +4103,8 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev

    // Call the second integration kernel.

-    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4172,16 +4198,18 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    float maxStepSizeFloat = (float) maxStepSize;
    double tol = integrator.getErrorTolerance();
    float tolFloat = (float) tol;
-    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
-            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
+    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
+    void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
+            useDouble ? (void*) &tol : (void*) &tolFloat,
            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
-    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    int sharedSize = blockSize*(useDouble ? sizeof(double) : sizeof(float));
    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);

    // Call the first integration kernel.

-    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel1, args1, numAtoms);

@@ -4191,7 +4219,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons

    // Call the second integration kernel.

-    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
+    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4199,7 +4227,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
    // Update the time and step count.

    double dt, time;
-    if (cu.getUseDoublePrecision()) {
+    if (useDouble) {
        double2 stepSize;
        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
        dt = stepSize.y;
@@ -4237,7 +4265,7 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
-    params = CudaArray::create<float>(cu, 3, "langevinParams");
+    params = new CudaArray(cu, 3, cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float), "langevinParams");
    blockSize = min(256, system.getNumParticles());
    blockSize = max(blockSize, params->getSize());
 }
@@ -4257,13 +4285,14 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    float tauFloat = (float) tau;
    double kT = BOLTZ*integrator.getTemperature();
    float kTFloat = (float) kT;
-    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
-            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
-            cu.getUseDoublePrecision() ? (void*) &tau : (void*) &tauFloat,
-            cu.getUseDoublePrecision() ? (void*) &kT : (void*) &kTFloat,
+    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
+    void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
+            useDouble ? (void*) &tol : (void*) &tolFloat,
+            useDouble ? (void*) &tau : (void*) &tauFloat,
+            useDouble ? (void*) &kT : (void*) &kTFloat,
            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()};
-    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    int sharedSize = blockSize*(useDouble ? sizeof(double) : sizeof(float));
    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);

    // Call the first integration kernel.
@@ -4279,7 +4308,8 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co

    // Call the second integration kernel.

-    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4287,7 +4317,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
    // Update the time and step count.

    double dt, time;
-    if (cu.getUseDoublePrecision()) {
+    if (useDouble) {
        double2 stepSize;
        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
        dt = stepSize.y;
@@ -5099,7 +5129,7 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {

    const vector<int>& order = cu.getAtomIndex();
    double energy = 0.0;
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        double4* velm = (double4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (size_t i = 0; i < masses.size(); ++i) {

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -446,12 +446,12 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getAccumulateInDouble())
+    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getUseMixedPrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
        defines["ENABLE_SHUFFLE"] = "1";
    stringstream defineAccum;
-    if (context.getAccumulateInDouble()) {
+    if (context.getUseMixedPrecision()) {
        defineAccum << "typedef double accum;\n";
        defineAccum << "typedef double3 accum3;\n";
        defines["make_accum3"] = "make_double3";

--- a/platforms/cuda/src/kernels/andersenThermostat.cu
+++ b/platforms/cuda/src/kernels/andersenThermostat.cu
@@ -2,12 +2,12 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */

-extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, real4* velm, const real2* __restrict__ stepSize, const float4* __restrict__ random,
+extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
        unsigned int randomIndex, const int* __restrict__ atomGroups) {
-    float collisionProbability = 1.0f-expf(-collisionFrequency*stepSize[0].y);
+    float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
    float randomRange = erff(collisionProbability/sqrtf(2.0f));
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];
        real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -4,23 +4,23 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
 * Perform the first step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta,
-        const real* __restrict__ paramBuffer, const real2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
-    real vscale = paramBuffer[VelScale];
-    real fscale = paramBuffer[ForceScale]/(real) 0xFFFFFFFF;
-    real noisescale = paramBuffer[NoiseScale];
-    real stepSize = dt[0].y;
+extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+        const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+    mixed vscale = paramBuffer[VelScale];
+    mixed fscale = paramBuffer[ForceScale]/(mixed) 0xFFFFFFFF;
+    mixed noisescale = paramBuffer[NoiseScale];
+    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    randomIndex += index;
    while (index < NUM_ATOMS) {
-        real4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
-            real sqrtInvMass = SQRT(velocity.w);
+            mixed sqrtInvMass = SQRT(velocity.w);
            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
            velm[index] = velocity;
-            posDelta[index] = make_real4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
+            posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
        }
        randomIndex += blockDim.x*gridDim.x;
        index += blockDim.x*gridDim.x;
@@ -31,21 +31,32 @@ extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, cons
 * Perform the second step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, const real4* __restrict__ posDelta, real4* __restrict__ velm, const real2* __restrict__ dt) {
+extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
    double invStepSize = 1.0/dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    while (index < NUM_ATOMS) {
-        real4 vel = velm[index];
+        mixed4 vel = velm[index];
        if (vel.w != 0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
            real4 pos = posq[index];
-            real4 delta = posDelta[index];
+#endif
+            mixed4 delta = posDelta[index];
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
-            vel.x = (real) invStepSize*delta.x;
-            vel.y = (real) invStepSize*delta.y;
-            vel.z = (real) invStepSize*delta.z;
+            vel.x = (mixed) (invStepSize*delta.x);
+            vel.y = (mixed) (invStepSize*delta.y);
+            vel.z = (mixed) (invStepSize*delta.z);
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
            posq[index] = pos;
+#endif
            velm[index] = vel;
        }
        index += blockDim.x*gridDim.x;
@@ -56,18 +67,18 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, cons
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTol, real tau, real kT, real2* __restrict__ dt,
-        const real4* __restrict__ velm, const long long* __restrict__ force, real* __restrict__ paramBuffer) {
+extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
+        const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
    // Calculate the error.

-    extern __shared__ real params[];
-    real* error = &params[MaxParams];
-    real err = 0;
+    extern __shared__ mixed params[];
+    mixed* error = &params[MaxParams];
+    mixed err = 0;
    unsigned int index = threadIdx.x;
-    const real scale = RECIP((real) 0xFFFFFFFF);
+    const mixed scale = RECIP((mixed) 0xFFFFFFFF);
    while (index < NUM_ATOMS) {
-        real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
-        real invMass = velm[index].w;
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
        index += blockDim.x*gridDim.x;
    }
@@ -84,9 +95,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.

-        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        real newStepSize = sqrt(errorTol/totalError);
-        real oldStepSize = dt[0].y;
+        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
@@ -97,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo

        // Recalculate the integration parameters.

-        real vscale = exp(-newStepSize/tau);
-        real fscale = (1-vscale)*tau;
-        real noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed vscale = exp(-newStepSize/tau);
+        mixed fscale = (1-vscale)*tau;
+        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/cuda/src/kernels/removeCM.cu
+++ b/platforms/cuda/src/kernels/removeCM.cu
@@ -2,13 +2,13 @@
 * Calculate the center of mass momentum.
 */

-extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* __restrict__ velm, float4* __restrict__ cmMomentum) {
+extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const mixed4* __restrict__ velm, float4* __restrict__ cmMomentum) {
    extern __shared__ volatile float3 temp[];
    float3 cm = make_float3(0, 0, 0);
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        real4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-            real mass = RECIP(velocity.w);
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0) {
+            mixed mass = RECIP(velocity.w);
            cm.x += (float) velocity.x*mass;
            cm.y += (float) velocity.y*mass;
            cm.z += (float) velocity.z*mass;
@@ -57,7 +57,7 @@ extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* _
 * Remove center of mass motion.
 */

-extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, real4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
+extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, mixed4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
    // First sum all of the momenta that were calculated by individual groups.

    extern volatile float3 temp[];
@@ -104,7 +104,7 @@ extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, rea
    // Now remove the center of mass velocity from each atom.

    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        real4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        velocity.x -= cm.x;
        velocity.y -= cm.y;
        velocity.z -= cm.z;

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -2,15 +2,22 @@
 * Perform the first step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, const real4* __restrict__ posq, real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta) {
-    const real2 stepSize = dt[0];
-    const real dtPos = stepSize.y;
-    const real dtVel = 0.5f*(stepSize.x+stepSize.y);
-    const real scale = dtVel/(real) 0xFFFFFFFF;
+extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
+    const mixed2 stepSize = dt[0];
+    const mixed dtPos = stepSize.y;
+    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const mixed scale = dtVel/(mixed) 0xFFFFFFFF;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
            real4 pos = posq[index];
+#endif
            velocity.x += scale*force[index]*velocity.w;
            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
@@ -27,22 +34,34 @@ extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, co
 * Perform the second step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* __restrict__ posq, real4* __restrict__ velm, const real4* __restrict__ posDelta) {
-    real2 stepSize = dt[0];
+extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
+        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+    mixed2 stepSize = dt[0];
    double oneOverDt = 1.0/stepSize.y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
            real4 pos = posq[index];
-            real4 delta = posDelta[index];
+#endif
+            mixed4 delta = posDelta[index];
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
-            velocity = make_real4((real) (delta.x*oneOverDt), (real) (delta.y*oneOverDt), (real) (delta.z*oneOverDt), velocity.w);
+            velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
            posq[index] = pos;
+#endif
            velm[index] = velocity;
        }
    }
@@ -52,15 +71,15 @@ extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* _
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const long long* __restrict__ force) {
+extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
    // Calculate the error.

-    extern __shared__ real error[];
-    real err = 0.0f;
-    const real scale = RECIP((real) 0xFFFFFFFF);
+    extern __shared__ mixed error[];
+    mixed err = 0.0f;
+    const mixed scale = RECIP((mixed) 0xFFFFFFFF);
    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
-        real invMass = velm[index].w;
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
    }
    error[threadIdx.x] = err;
@@ -74,9 +93,9 @@ extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol,
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        real totalError = SQRT(error[0]/(NUM_ATOMS*3));
-        real newStepSize = SQRT(errorTol/totalError);
-        real oldStepSize = dt[0].y;
+        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1410,11 +1410,12 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl&
    }
 }

-template <class T, class T4>
+template <class T, class T4, class M4>
 void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
    // Compute the local coordinates relative to the center of mass.
    int numAtoms = cu.getNumAtoms();
-    vector<T4> posq, velm;
+    vector<T4> posq;
+    vector<M4> velm;
    cu.getPosq().download(posq);
    cu.getVelm().download(velm);
    double totalMass = 0.0;
@@ -1524,9 +1525,11 @@ void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextIm
 void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
    context.calcForcesAndEnergy(false, false, -1);
    if (cu.getUseDoublePrecision())
-        computeSystemMultipoleMoments<double, double4>(context, outputMultipoleMoments);
+        computeSystemMultipoleMoments<double, double4, double4>(context, outputMultipoleMoments);
+    else if (cu.getUseMixedPrecision())
+        computeSystemMultipoleMoments<float, float4, double4>(context, outputMultipoleMoments);
    else
-        computeSystemMultipoleMoments<float, float4>(context, outputMultipoleMoments);
+        computeSystemMultipoleMoments<float, float4, float4>(context, outputMultipoleMoments);
 }

 /* -------------------------------------------------------------------------- *

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -321,7 +321,7 @@ private:
        const char* getSortKey() const {return "value.y";}
    };
    void initializeScaleFactors();
-    template <class T, class T4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
+    template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
    int numMultipoles, maxInducedIterations;
    double inducedEpsilon;
    bool hasInitializedScaleFactors, hasInitializedFFT;