Finished implementing mixed precision model

8cda1b79 · Peter Eastman · 9ad85ebd · 8cda1b79 · 8cda1b79 · 8cda1b79
Commit 8cda1b79 authored Oct 04, 2012 by Peter Eastman
7 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -378,7 +378,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    cu.clearBuffer(integration.getPosDelta());
    integration.applyConstraints(tol);
-    void* args[] = {&cu.getPosq().getDevicePointer(), &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    void* args[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
    cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
    integration.computeVirtualSites();
 }
@@ -4144,12 +4145,13 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
    float stepSizeFloat = (float) stepSize;
    float tauDtFloat = (float) tauDt;
    float noiseFloat = (float) noise;
+    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
    // Call the first integration kernel.
    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-    void* args1[] = {cu.getUseDoublePrecision() ? (void*) &tauDt : (void*) &tauDtFloat,
+    void* args1[] = {useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
-            cu.getUseDoublePrecision() ? (void*) &noise : (void*) &noiseFloat,
+            useDouble ? (void*) &noise : (void*) &noiseFloat,
            &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
    cu.executeKernel(kernel1, args1, numAtoms);
@@ -4160,8 +4162,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
    // Call the second integration kernel.
-    void* args2[] = {cu.getUseDoublePrecision() ? (void*) &stepSize : (void*) &stepSizeFloat,
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
-            &cu.getPosq().getDevicePointer(), &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
+    void* args2[] = {useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
+            &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
    cu.executeKernel(kernel2, args2, numAtoms);
    integration.computeVirtualSites();
@@ -4354,7 +4357,7 @@ public:
            return;
        int numAtoms = cu.getNumAtoms();
        const vector<int>& order = cu.getAtomIndex();
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            if (deviceValuesAreCurrent)
                perDofValues.getParameterValues(localPerDofValuesDouble);
            vector<vector<double> > swap(3*numAtoms);
@@ -4422,11 +4425,11 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
-    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
    globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables");
    sumBuffer = new CudaArray(cu, 3*system.getNumParticles(), elementSize, "sumBuffer");
-    energy = new CudaArray(cu, 1, elementSize, "energy");
+    energy = new CudaArray(cu, 1, cu.getEnergyBuffer().getElementSize(), "energy");
-    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision());
+    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
    prevStepSize = -1.0;
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
@@ -4502,13 +4505,14 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    int numSteps = integrator.getNumComputations();
+    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        // Initialize various data structures.
        const map<string, double>& params = context.getParameters();
-        if (cu.getUseDoublePrecision()) {
+        if (useDouble) {
            contextParameterValues = CudaArray::create<double>(cu, max(1, (int) params.size()), "contextParameters");
            contextValuesDouble.resize(contextParameterValues->getSize());
            for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
@@ -4674,9 +4678,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
                    if (variable[j] == "x") {
                        if (storePosAsDelta[j])
-                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(posq[index]));\n";
+                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
                        else
-                            compute << "posq[index] = convertFromDouble4(position);\n";
+                            compute << "storePos(posq, posqCorrection, index, convertFromDouble4(position));\n";
                    }
                    else if (variable[j] == "v")
                        compute << "velm[index] = convertFromDouble4(velocity);\n";
@@ -4712,6 +4716,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                requiredUniform[step] = numUniform;
                vector<void*> args1;
                args1.push_back(&cu.getPosq().getDevicePointer());
+                args1.push_back(NULL);
                args1.push_back(&integration.getPosDelta().getDevicePointer());
                args1.push_back(&cu.getVelm().getDevicePointer());
                args1.push_back(&cu.getForce().getDevicePointer());
@@ -4749,7 +4754,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                        throw OpenMMException("Unknown global variable: "+variable[step]);
                    defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
                    module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-                    kernel = cu.getKernel(module, "computeSum");
+                    kernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
                    kernels[step].push_back(kernel);
                    kernelArgs[step].push_back(args2);
                }
@@ -4782,6 +4787,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                kernels[step].push_back(kernel);
                vector<void*> args;
                args.push_back(&cu.getPosq().getDevicePointer());
+                args.push_back(NULL);
                args.push_back(&integration.getPosDelta().getDevicePointer());
                kernelArgs[step].push_back(args);
            }
@@ -4792,13 +4798,13 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        defines["SUM_OUTPUT_INDEX"] = "0";
        defines["SUM_BUFFER_SIZE"] = cu.intToString(cu.getEnergyBuffer().getSize());
        CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-        sumEnergyKernel = cu.getKernel(module, "computeSum");
+        sumEnergyKernel = cu.getKernel(module, cu.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
    }
    // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
    if (!deviceValuesAreCurrent) {
-        if (cu.getUseDoublePrecision())
+        if (useDouble)
            perDofValues->setParameterValues(localPerDofValuesDouble);
        else
            perDofValues->setParameterValues(localPerDofValuesFloat);
@@ -4807,7 +4813,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
    if (stepSize != prevStepSize) {
-        if (cu.getUseDoublePrecision()) {
+        if (useDouble) {
            double size[] = {0, stepSize};
            integration.getStepSize().upload(size);
        }
@@ -4818,7 +4824,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        prevStepSize = stepSize;
    }
    bool paramsChanged = false;
-    if (cu.getUseDoublePrecision()) {
+    if (useDouble) {
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            double value = context.getParameter(parameterNames[i]);
            if (value != contextValuesDouble[i]) {
@@ -4844,6 +4850,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    // Loop over computation steps in the integrator and execute them.
    void* randomArgs[] = {&uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()};
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
    for (int i = 0; i < numSteps; i++) {
        if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || context.getLastForceGroups() != forceGroup[i])) {
            // Recompute forces and/or energy.  Figure out what is actually needed
@@ -4872,7 +4879,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        }
        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
-            kernelArgs[i][0][9] = &randomIndex;
+            kernelArgs[i][0][1] = &posCorrection;
+            kernelArgs[i][0][10] = &randomIndex;
            if (requiredUniform[i] > 0)
                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
@@ -4886,7 +4894,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        }
        else if (stepType[i] == CustomIntegrator::ComputeSum) {
            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
-            kernelArgs[i][0][9] = &randomIndex;
+            kernelArgs[i][0][1] = &posCorrection;
+            kernelArgs[i][0][10] = &randomIndex;
            if (requiredUniform[i] > 0)
                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
@@ -4898,6 +4907,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        }
        else if (stepType[i] == CustomIntegrator::ConstrainPositions) {
            cu.getIntegrationUtilities().applyConstraints(integrator.getConstraintTolerance());
+            kernelArgs[i][0][1] = &posCorrection;
            cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
            cu.getIntegrationUtilities().computeVirtualSites();
        }
@@ -4918,7 +4928,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
 void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
    if (!modifiesParameters)
        return;
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        contextParameterValues->download(contextValuesDouble);
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            double value = context.getParameter(parameterNames[i]);
@@ -4940,7 +4950,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
    values.resize(numGlobalVariables);
    if (numGlobalVariables == 0)
        return;
-    if (cu.getUseDoublePrecision())
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
        globalValues->download(values);
    else {
        vector<float> buffer;
@@ -4953,7 +4963,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
 void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
    if (numGlobalVariables == 0)
        return;
-    if (cu.getUseDoublePrecision())
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
        globalValues->upload(values);
    else {
        vector<float> buffer(numGlobalVariables);
@@ -4966,7 +4976,7 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con
 void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
    values.resize(perDofValues->getNumObjects()/3);
    const vector<int>& order = cu.getAtomIndex();
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesDouble);
            localValuesAreCurrent = true;
@@ -4988,7 +4998,7 @@ void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int
 void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
    const vector<int>& order = cu.getAtomIndex();
-    if (cu.getUseDoublePrecision()) {
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesDouble);
            localValuesAreCurrent = true;

--- a/platforms/cuda/src/kernels/brownian.cu
+++ b/platforms/cuda/src/kernels/brownian.cu
@@ -2,12 +2,12 @@
 * Perform the first step of Brownian integration.
 */
-extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmplitude, const long long* __restrict__ force,
+extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
-        real4* __restrict__ posDelta, const real4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
+        mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
-    const real fscale = tauDeltaT/(real) 0xFFFFFFFF;
+    const mixed fscale = tauDeltaT/(mixed) 0xFFFFFFFF;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real invMass = velm[index].w;
+        mixed invMass = velm[index].w;
        if (invMass != 0) {
            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
            posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
@@ -21,17 +21,30 @@ extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmpl
 * Perform the second step of Brownian integration.
 */
-extern "C" __global__ void integrateBrownianPart2(real deltaT, real4* posq, real4* velm, const real4* __restrict__ posDelta) {
+extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
-    const real oneOverDeltaT = RECIP(deltaT);
+    const mixed oneOverDeltaT = RECIP(deltaT);
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        if (velm[index].w != 0) {
-            real4 delta = posDelta[index];
+            mixed4 delta = posDelta[index];
            velm[index].x = oneOverDeltaT*delta.x;
            velm[index].y = oneOverDeltaT*delta.y;
            velm[index].z = oneOverDeltaT*delta.z;
-            posq[index].x = posq[index].x + delta.x;
+#ifdef USE_MIXED_PRECISION
-            posq[index].y = posq[index].y + delta.y;
+            real4 pos1 = posq[index];
-            posq[index].z = posq[index].z + delta.z;
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[index] = pos;
+#endif
        }
    }
 }
--- a/platforms/cuda/src/kernels/constraints.cu
+++ b/platforms/cuda/src/kernels/constraints.cu
-extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real4 position = posq[index];
+#ifdef USE_MIXED_PRECISION
-        position.x += posDelta[index].x;
+        real4 pos1 = posq[index];
-        position.y += posDelta[index].y;
+        real4 pos2 = posqCorrection[index];
-        position.z += posDelta[index].z;
+        mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-        posq[index] = position;
+#else
+        mixed4 pos = posq[index];
+#endif
+        pos.x += posDelta[index].x;
+        pos.y += posDelta[index].y;
+        pos.z += posDelta[index].z;
+#ifdef USE_MIXED_PRECISION
+        posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+        posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+        posq[index] = pos;
+#endif
    }
 }
--- a/platforms/cuda/src/kernels/customIntegrator.cu
+++ b/platforms/cuda/src/kernels/customIntegrator.cu
-extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* result) {
+extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, float* result) {
-    __shared__ real tempBuffer[WORK_GROUP_SIZE];
+    __shared__ float tempBuffer[WORK_GROUP_SIZE];
    const unsigned int thread = threadIdx.x;
-    real sum = 0;
+    float sum = 0;
    for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
        sum += sumBuffer[index];
    tempBuffer[thread] = sum;
@@ -14,14 +14,41 @@ extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real*
        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
 }
-extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
+    __shared__ double tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int thread = threadIdx.x;
+    double sum = 0;
+    for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
+        sum += sumBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0)
+        result[SUM_OUTPUT_INDEX] = tempBuffer[0];
+}
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        real4 position = posq[index];
+#ifdef USE_MIXED_PRECISION
-        position.x += posDelta[index].x;
+        real4 pos1 = posq[index];
-        position.y += posDelta[index].y;
+        real4 pos2 = posqCorrection[index];
-        position.z += posDelta[index].z;
+        mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-        posq[index] = position;
+#else
-        posDelta[index] = make_real4(0, 0, 0, 0);
+        real4 pos = posq[index];
+#endif
+        pos.x += posDelta[index].x;
+        pos.y += posDelta[index].y;
+        pos.z += posDelta[index].z;
+#ifdef USE_MIXED_PRECISION
+        posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+        posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+        posq[index] = pos;
+#endif
+        posDelta[index] = make_mixed4(0, 0, 0, 0);
    }
 }

--- a/platforms/cuda/src/kernels/customIntegratorGlobal.cu
+++ b/platforms/cuda/src/kernels/customIntegratorGlobal.cu
-extern "C" __global__ void computeGlobal(real2* __restrict__ dt, real* __restrict__ globals, real* __restrict__ params,
+extern "C" __global__ void computeGlobal(mixed2* __restrict__ dt, mixed* __restrict__ globals, mixed* __restrict__ params,
        float uniform, float gaussian, const real* __restrict__ energy) {
    COMPUTE_STEP
 }
--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
-inline __device__ double4 convertToDouble4(real4 a) {
+/**
+ * Load the position of a particle.
+ */
+inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) {
+#ifdef USE_MIXED_PRECISION
+    real4 pos1 = posq[index];
+    real4 pos2 = posqCorrection[index];
+    return make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+    return posq[index];
+#endif
+}
+/**
+ * Store the position of a particle.
+ */
+inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) {
+#ifdef USE_MIXED_PRECISION
+    posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+    posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+    posq[index] = pos;
+#endif
+}
+inline __device__ double4 convertToDouble4(mixed4 a) {
    return make_double4(a.x, a.y, a.z, a.w);
 }
-inline __device__ real4 convertFromDouble4(double4 a) {
+inline __device__ mixed4 convertFromDouble4(double4 a) {
-    return make_real4(a.x, a.y, a.z, a.w);
+    return make_mixed4(a.x, a.y, a.z, a.w);
 }
-extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posDelta, real4* __restrict__ velm,
+extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
-        const long long* __restrict__ force, const real2* __restrict__ dt, const real* __restrict__ globals,
+        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
-        const real* __restrict__ params, real* __restrict__ sum, const float4* __restrict__ gaussianValues,
+        const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues,
        unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
        PARAMETER_ARGUMENTS) {
-    real stepSize = dt[0].y;
+    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    randomIndex += index;
    const double forceScale = 1.0/0xFFFFFFFF;
    while (index < NUM_ATOMS) {
 #ifdef LOAD_POS_AS_DELTA
-        double4 position = convertToDouble4(posq[index]+posDelta[index]);
+        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index)+posDelta[index]);
 #else
-        double4 position = convertToDouble4(posq[index]);
+        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index));
 #endif
        double4 velocity = convertToDouble4(velm[index]);
        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -779,10 +779,11 @@ inline __device__ real3 loadForce(int index, long long* __restrict__ force) {
    return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
 }
-inline __device__ void storeForce(int index, long long* __restrict__ force, real3 value) {
+inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) {
-    force[index] = (long long) (value.x*0xFFFFFFFF);
+    unsigned long long* f = (unsigned long long*) force;
-    force[index+PADDED_NUM_ATOMS] = (long long) (value.y*0xFFFFFFFF);
+    atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0xFFFFFFFF)));
-    force[index+2*PADDED_NUM_ATOMS] = (long long) (value.z*0xFFFFFFFF);
+    atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0xFFFFFFFF)));
+    atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0xFFFFFFFF)));
 }
 /**
@@ -799,12 +800,8 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
        int4 atoms = avg2Atoms[index];
        real2 weights = avg2Weights[index];
        real3 f = loadForce(atoms.x, force);
-        real3 f1 = loadForce(atoms.y, force);
+        addForce(atoms.y, force, f*weights.x);
-        real3 f2 = loadForce(atoms.z, force);
+        addForce(atoms.z, force, f*weights.y);
-        f1 += f*weights.x;
-        f2 += f*weights.y;
-        storeForce(atoms.y, force, f1);
-        storeForce(atoms.z, force, f2);
    }
    // Three particle average sites.
@@ -813,15 +810,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
        int4 atoms = avg3Atoms[index];
        real4 weights = avg3Weights[index];
        real3 f = loadForce(atoms.x, force);
-        real3 f1 = loadForce(atoms.y, force);
+        addForce(atoms.y, force, f*weights.x);
-        real3 f2 = loadForce(atoms.z, force);
+        addForce(atoms.z, force, f*weights.y);
-        real3 f3 = loadForce(atoms.w, force);
+        addForce(atoms.w, force, f*weights.z);
-        f1 += f*weights.x;
-        f2 += f*weights.y;
-        f3 += f*weights.z;
-        storeForce(atoms.y, force, f1);
-        storeForce(atoms.z, force, f2);
-        storeForce(atoms.w, force, f3);
    }
    // Out of plane sites.
@@ -835,20 +826,14 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
        mixed4 v12 = pos2-pos1;
        mixed4 v13 = pos3-pos1;
        real3 f = loadForce(atoms.x, force);
-        real3 f1 = loadForce(atoms.y, force);
-        real3 f2 = loadForce(atoms.z, force);
-        real3 f3 = loadForce(atoms.w, force);
        real3 fp2 = make_real3((real) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z),
                   (real) (weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z),
                   (real) (-weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z));
        real3 fp3 = make_real3((real) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z),
                   (real) (-weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z),
                   (real) (weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z));
-        f1 += f-fp2-fp3;
+        addForce(atoms.y, force, f-fp2-fp3);
-        f2 += fp2;
+        addForce(atoms.z, force, fp2);
-        f3 += fp3;
+        addForce(atoms.w, force, fp3);
-        storeForce(atoms.y, force, f1);
-        storeForce(atoms.z, force, f2);
-        storeForce(atoms.w, force, f3);
    }
 }