RPMD supports mixed and double precision

cd6af26e · Peter Eastman · b8b2e1ef · cd6af26e · cd6af26e · cd6af26e
Commit cd6af26e authored Nov 16, 2012 by Peter Eastman
10 changed files
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
@@ -91,7 +91,7 @@ private:
    CudaArray* forces;
    CudaArray* positions;
    CudaArray* velocities;
-    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    CUfunction pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
-__device__ float3 multiplyComplexRealPart(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }
-__device__ float3 multiplyComplexImagPart(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }
-__device__ float3 multiplyComplexRealPartConj(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }
-__device__ float3 multiplyComplexImagPartConj(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }
 /**
 * Apply the PILE-L thermostat.
 */
-extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, unsigned int randomIndex,
+extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, unsigned int randomIndex,
-        float dt, float kT, float friction) {
+        mixed dt, mixed kT, mixed friction) {
    const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float nkT = NUM_COPIES*kT;
+    const mixed nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
+    const mixed twown = 2.0f*nkT/HBAR;
-    const float c1_0 = EXP(-0.5f*dt*friction);
+    const mixed c1_0 = EXP(-0.5f*dt*friction);
-    const float c2_0 = SQRT(1.0f-c1_0*c1_0);
+    const mixed c2_0 = SQRT(1.0f-c1_0*c1_0);
-    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ float2 w[NUM_COPIES];
+    __shared__ mixed2 w[NUM_COPIES];
-    float3* vreal = &v[blockStart];
+    mixed3* vreal = &v[blockStart];
-    float3* vimag = &v[blockStart+blockDim.x];
+    mixed3* vimag = &v[blockStart+blockDim.x];
    if (threadIdx.x < NUM_COPIES)
-        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    __syncthreads();
    randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES);
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float invMass = particleVelm.w;
+        mixed invMass = particleVelm.w;
-        float c3_0 = c2_0*SQRT(nkT*invMass);
+        mixed c3_0 = c2_0*SQRT(nkT*invMass);
        // Forward FFT.
-        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
-        vimag[indexInBlock] = make_float3(0);
+        vimag[indexInBlock] = make_mixed3(0);
        __syncthreads();
        FFT_V_FORWARD
@@ -53,28 +53,28 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
            // Apply a local Langevin thermostat to the centroid mode.
            float4 rand = random[randomIndex];
-            vreal[0] = vreal[0]*c1_0 + c3_0*make_float3(rand.x, rand.y, rand.z);
+            vreal[0] = vreal[0]*c1_0 + c3_0*make_mixed3(rand.x, rand.y, rand.z);
        }
        else {
            // Use critical damping white noise for the remaining modes.
            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
-            const float wk = twown*sin(k*M_PI/NUM_COPIES);
+            const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
-            const float c1 = EXP(-wk*dt);
+            const mixed c1 = EXP(-wk*dt);
-            const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const mixed c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
-            const float c3 = c2*SQRT(nkT*invMass);
+            const mixed c3 = c2*SQRT(nkT*invMass);
-            float4 rand1 = c3*random[randomIndex+k];
+            float4 rand1 = random[randomIndex+k];
-            float4 rand2 = (isCenter ? make_float4(0) : c3*random[randomIndex+NUM_COPIES-k]);
+            float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]);
-            vreal[indexInBlock] = c1*vreal[indexInBlock] + make_float3(rand1.x, rand1.y, rand1.z);
+            vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z);
-            vimag[indexInBlock] = c1*vimag[indexInBlock] + (indexInBlock < NUM_COPIES/2 ? make_float3(rand2.x, rand2.y, rand2.z) : make_float3(-rand2.x, -rand2.y, -rand2.z));
+            vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z));
        }
        __syncthreads();
        // Inverse FFT.
        FFT_V_BACKWARD
-        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
        randomIndex += blockDim.x*gridDim.x;
    }
 }
@@ -82,24 +82,24 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
 /**
 * Advance the positions and velocities.
 */
-extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* force, float dt, float kT) {
+extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* force, mixed dt, mixed kT) {
    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float nkT = NUM_COPIES*kT;
+    const mixed nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
+    const mixed twown = 2.0f*nkT/HBAR;
-    const float forceScale = 1/(float) 0xFFFFFFFF;
+    const mixed forceScale = 1/(mixed) 0xFFFFFFFF;
-    __shared__ float3 q[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ float2 w[NUM_COPIES];
+    __shared__ mixed2 w[NUM_COPIES];
    // Update velocities.
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
        particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
        particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
        particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
@@ -108,23 +108,23 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
    // Evolve the free ring polymer by transforming to the frequency domain.
-    float3* qreal = &q[blockStart];
+    mixed3* qreal = &q[blockStart];
-    float3* qimag = &q[blockStart+blockDim.x];
+    mixed3* qimag = &q[blockStart+blockDim.x];
-    float3* vreal = &v[blockStart];
+    mixed3* vreal = &v[blockStart];
-    float3* vimag = &v[blockStart+blockDim.x];
+    mixed3* vimag = &v[blockStart+blockDim.x];
    if (threadIdx.x < NUM_COPIES)
-        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    __syncthreads();
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        // Forward FFT.
-        qreal[indexInBlock] = SCALE*make_float3(particlePosq.x, particlePosq.y, particlePosq.z);
+        qreal[indexInBlock] = SCALE*make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
-        qimag[indexInBlock] = make_float3(0);
+        qimag[indexInBlock] = make_mixed3(0);
-        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
-        vimag[indexInBlock] = make_float3(0);
+        vimag[indexInBlock] = make_mixed3(0);
        __syncthreads();
        FFT_Q_FORWARD
        FFT_V_FORWARD
@@ -136,12 +136,12 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
            qimag[0] += vimag[0]*dt;
        }
        else {
-            const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
+            const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
-            const float wt = wk*dt;
+            const mixed wt = wk*dt;
-            const float coswt = cos(wt);
+            const mixed coswt = cos(wt);
-            const float sinwt = sin(wt);
+            const mixed sinwt = sin(wt);
-            const float3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
+            const mixed3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
-            const float3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
+            const mixed3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
            qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
            vreal[indexInBlock] = vprimereal;
@@ -153,26 +153,26 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
        FFT_Q_BACKWARD
        FFT_V_BACKWARD
-        posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
+        posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
-        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
    }
 }
 /**
 * Advance the velocities by a half step.
 */
-extern "C" __global__ void advanceVelocities(float4* velm, long long* force, float dt) {
+extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mixed dt) {
    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float forceScale = 1/(float) 0xFFFFFFFF;
+    const mixed forceScale = 1/(mixed) 0xFFFFFFFF;
    // Update velocities.
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
        particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
        particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
        particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
@@ -181,9 +181,20 @@ extern "C" __global__ void advanceVelocities(float4* velm, long long* force, flo
 }
 /**
- * Copy a set of per-atom values from the integrator's arrays to the context.
+ * Copy a set of positions from the integrator's arrays to the context.
 */
-extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, int copy) {
+extern "C" __global__ void copyPositionsToContext(mixed4* src, real4* dst, int* order, int copy) {
+    const int base = copy*PADDED_NUM_ATOMS;
+    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
+        mixed4 posq = src[base+order[particle]];
+        dst[particle] = make_real4(posq.x, posq.y, posq.z, posq.w);
+    }
+}
+/**
+ * Copy a set of velocities from the integrator's arrays to the context.
+ */
+extern "C" __global__ void copyVelocitiesToContext(mixed4* src, mixed4* dst, int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        dst[particle] = src[base+order[particle]];
@@ -191,9 +202,9 @@ extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, i
 }
 /**
- * Copy a set of per-atom force values from the context to the integrator's arrays.
+ * Copy a set of forces from the context to the integrator's arrays.
 */
-extern "C" __global__ void copyFromContext(long long* src, long long* dst, int* order, int copy) {
+extern "C" __global__ void copyForcesFromContext(long long* src, long long* dst, int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS*3;
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        dst[base+order[particle]] = src[particle];
@@ -205,10 +216,11 @@ extern "C" __global__ void copyFromContext(long long* src, long long* dst, int*
 /**
 * Update atom positions so all copies are offset by the same number of periodic box widths.
 */
-extern "C" __global__ void applyCellTranslations(float4* posq, float4* movedPos, int* order, int movedCopy) {
+extern "C" __global__ void applyCellTranslations(mixed4* posq, real4* movedPos, int* order, int movedCopy) {
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        int index = order[particle];
-        float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index];
+        real4 p = movedPos[particle];
+        mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index];
        for (int copy = 0; copy < NUM_COPIES; copy++)
            posq[copy*PADDED_NUM_ATOMS+index] += delta;
    }

--- a/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
-    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
+        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
@@ -165,7 +165,6 @@ void testParaHydrogen() {
    vector<int> counts(numBins, 0);
    const double invBoxSize = 1.0/boxSize;
    double meanKE = 0.0;
-    const RealOpenMM hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
    for (int step = 0; step < numSteps; step++) {
        integ.step(20);
        vector<State> states(numCopies);
@@ -221,9 +220,11 @@ void testParaHydrogen() {
    ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
 }
-int main() {
+int main(int argc, char* argv[]) {
    try {
        Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
+        if (argc > 1)
+            Platform::getPlatformByName("CUDA").setPropertyDefaultValue("CudaPrecision", string(argv[1]));
        testFreeParticles();
        testParaHydrogen();
    }

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -92,7 +92,7 @@ private:
    OpenCLArray* forces;
    OpenCLArray* positions;
    OpenCLArray* velocities;
-    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
+++ b/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
-float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexRealPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }
-float4 multiplyComplexImagPart(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexImagPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }
-float4 multiplyComplexRealPartConj(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexRealPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }
-float4 multiplyComplexImagPartConj(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexImagPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }
 /**
 * Apply the PILE-L thermostat.
 */
-__kernel void applyPileThermostat(__global float4* velm, __local float4* v, __local float4* temp, __local float2* w, __global float4* random, unsigned int randomIndex,
+__kernel void applyPileThermostat(__global mixed4* velm, __global float4* random, unsigned int randomIndex,
-        float dt, float kT, float friction) {
+        mixed dt, mixed kT, mixed friction) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
-    const float nkT = NUM_COPIES*kT;
+    const mixed nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
+    const mixed twown = 2.0f*nkT/HBAR;
-    const float c1_0 = EXP(-0.5f*dt*friction);
+    const mixed c1_0 = exp(-0.5f*dt*friction);
-    const float c2_0 = SQRT(1.0f-c1_0*c1_0);
+    const mixed c2_0 = sqrt(1.0f-c1_0*c1_0);
-    __local float4* vreal = &v[blockStart];
+    __local mixed4 v[2*THREAD_BLOCK_SIZE];
-    __local float4* vimag = &v[blockStart+get_local_size(0)];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];
+    __local mixed4* vreal = &v[blockStart];
+    __local mixed4* vimag = &v[blockStart+get_local_size(0)];
    if (get_local_id(0) < NUM_COPIES)
-        w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    barrier(CLK_LOCAL_MEM_FENCE);
    randomIndex += NUM_COPIES*(get_global_id(0)/NUM_COPIES);
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float invMass = particleVelm.w;
+        mixed invMass = particleVelm.w;
-        float c3_0 = c2_0*SQRT(nkT*invMass);
+        mixed c3_0 = c2_0*sqrt(nkT*invMass);
        // Forward FFT.
        vreal[indexInBlock] = SCALE*particleVelm;
-        vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        barrier(CLK_GLOBAL_MEM_FENCE);
        FFT_V_FORWARD
@@ -49,19 +52,19 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
        if (indexInBlock == 0) {
            // Apply a local Langevin thermostat to the centroid mode.
-            vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*random[randomIndex].xyz;
+            vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*convert_mixed4(random[randomIndex]).xyz;
        }
        else {
            // Use critical damping white noise for the remaining modes.
            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
-            const float wk = twown*sin(k*M_PI/NUM_COPIES);
+            const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
-            const float c1 = EXP(-wk*dt);
+            const mixed c1 = exp(-wk*dt);
-            const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const mixed c2 = sqrt((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
-            const float c3 = c2*SQRT(nkT*invMass);
+            const mixed c3 = c2*sqrt(nkT*invMass);
-            float4 rand1 = c3*random[randomIndex+k];
+            mixed4 rand1 = c3*convert_mixed4(random[randomIndex+k]);
-            float4 rand2 = (isCenter ? 0.0f : c3*random[randomIndex+NUM_COPIES-k]);
+            mixed4 rand2 = (isCenter ? 0.0f : c3*convert_mixed4(random[randomIndex+NUM_COPIES-k]));
            vreal[indexInBlock].xyz = c1*vreal[indexInBlock].xyz + rand1.xyz;
            vimag[indexInBlock].xyz = c1*vimag[indexInBlock].xyz + (indexInBlock < NUM_COPIES/2 ? rand2.xyz : -rand2.xyz);
        }
@@ -78,42 +81,45 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
 /**
 * Advance the positions and velocities.
 */
-__kernel void integrateStep(__global float4* posq, __global float4* velm, __global float4* force,
+__kernel void integrateStep(__global mixed4* posq, __global mixed4* velm, __global real4* force, mixed dt, mixed kT) {
-        __local float4* q, __local float4* v, __local float4* temp, __local float2* w, float dt, float kT) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
-    const float nkT = NUM_COPIES*kT;
+    const mixed nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
+    const mixed twown = 2.0f*nkT/HBAR;
+    __local mixed4 q[2*THREAD_BLOCK_SIZE];
+    __local mixed4 v[2*THREAD_BLOCK_SIZE];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];
    // Update velocities.
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
-        particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w);
+        particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
        velm[index] = particleVelm;
    }
    // Evolve the free ring polymer by transforming to the frequency domain.
-    __local float4* qreal = &q[blockStart];
+    __local mixed4* qreal = &q[blockStart];
-    __local float4* qimag = &q[blockStart+get_local_size(0)];
+    __local mixed4* qimag = &q[blockStart+get_local_size(0)];
-    __local float4* vreal = &v[blockStart];
+    __local mixed4* vreal = &v[blockStart];
-    __local float4* vimag = &v[blockStart+get_local_size(0)];
+    __local mixed4* vimag = &v[blockStart+get_local_size(0)];
    if (get_local_id(0) < NUM_COPIES)
-        w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        // Forward FFT.
        qreal[indexInBlock] = SCALE*particlePosq;
-        qimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        qimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        vreal[indexInBlock] = SCALE*particleVelm;
-        vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        barrier(CLK_GLOBAL_MEM_FENCE);
        FFT_Q_FORWARD
        FFT_V_FORWARD
@@ -125,12 +131,12 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
            qimag[0].xyz += vimag[0].xyz*dt;
        }
        else {
-            const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
+            const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
-            const float wt = wk*dt;
+            const mixed wt = wk*dt;
-            const float coswt = cos(wt);
+            const mixed coswt = cos(wt);
-            const float sinwt = sin(wt);
+            const mixed sinwt = sin(wt);
-            const float4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
+            const mixed4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
-            const float4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
+            const mixed4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
            qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
            vreal[indexInBlock] = vprimereal;
@@ -150,7 +156,7 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
 /**
 * Advance the velocities by a half step.
 */
-__kernel void advanceVelocities(__global float4* velm, __global float4* force, float dt) {
+__kernel void advanceVelocities(__global mixed4* velm, __global real4* force, mixed dt) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
@@ -159,16 +165,26 @@ __kernel void advanceVelocities(__global float4* velm, __global float4* force, f
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
-        particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w);
+        particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
        velm[index] = particleVelm;
    }
 }
 /**
- * Copy a set of per-atom values from the integrator's arrays to the context.
+ * Copy a set of positions from the integrator's arrays to the context.
 */
-__kernel void copyToContext(__global float4* src, __global float4* dst, __global int* order, int copy) {
+__kernel void copyPositionsToContext(__global mixed4* src, __global real4* dst, __global int* order, int copy) {
+    const int base = copy*PADDED_NUM_ATOMS;
+    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
+        dst[particle] = convert_real4(src[base+order[particle]]);
+    }
+}
+/**
+ * Copy a set of velocities from the integrator's arrays to the context.
+ */
+__kernel void copyVelocitiesToContext(__global mixed4* src, __global mixed4* dst, __global int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        dst[particle] = src[base+order[particle]];
@@ -176,9 +192,9 @@ __kernel void copyToContext(__global float4* src, __global float4* dst, __global
 }
 /**
- * Copy a set of per-atom values from the context to the integrator's arrays.
+ * Copy a set of forces from the context to the integrator's arrays.
 */
-__kernel void copyFromContext(__global float4* src, __global float4* dst, __global int* order, int copy) {
+__kernel void copyForcesFromContext(__global real4* src, __global real4* dst, __global int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        dst[base+order[particle]] = src[particle];
@@ -188,10 +204,10 @@ __kernel void copyFromContext(__global float4* src, __global float4* dst, __glob
 /**
 * Update atom positions so all copies are offset by the same number of periodic box widths.
 */
-__kernel void applyCellTranslations(__global float4* posq, __global float4* movedPos, __global int* order, int movedCopy) {
+__kernel void applyCellTranslations(__global mixed4* posq, __global real4* movedPos, __global int* order, int movedCopy) {
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        int index = order[particle];
-        float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index];
+        mixed4 delta = convert_mixed4(movedPos[particle])-posq[movedCopy*PADDED_NUM_ATOMS+index];
        for (int copy = 0; copy < NUM_COPIES; copy++)
            posq[copy*PADDED_NUM_ATOMS+index] += delta;
    }

--- a/plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
-    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
+        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
+++ b/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -221,9 +221,11 @@ void testParaHydrogen() {
    ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
 }
-int main() {
+int main(int argc, char* argv[]) {
    try {
        Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
+        if (argc > 1)
+            Platform::getPlatformByName("OpenCL").setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
        testFreeParticles();
        testParaHydrogen();
    }