RPMD supports mixed and double precision

cd6af26e · Peter Eastman · b8b2e1ef · cd6af26e · cd6af26e · cd6af26e
Commit cd6af26e authored Nov 16, 2012 by Peter Eastman
10 changed files
--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
@@ -70,6 +70,7 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
    if (velocities != NULL)
        delete velocities;
 }
+
 void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cu.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
@@ -80,20 +81,33 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
    if (numCopies != findFFTDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cu.getPaddedNumAtoms();
+    bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
+    int elementSize = (useDoublePrecision ? sizeof(double4) : sizeof(float4));
    forces = CudaArray::create<long long>(cu, numCopies*paddedParticles*3, "rpmdForces");
-    positions = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdPositions");
-    velocities = CudaArray::create<float4>(cu, numCopies*paddedParticles, "rpmdVelocities");
+    positions = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdPositions");
+    velocities = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdVelocities");
    cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
-    vector<float4> temp(positions->getSize());
-    for (int i = 0; i < positions->getSize(); i++)
-        temp[i] = make_float4(0, 0, 0, 0);
-    positions->upload(temp);
-    for (int i = 0; i < velocities->getSize(); i++)
-        temp[i] = make_float4(0, 0, 0, 1);
-    velocities->upload(temp);
+    if (useDoublePrecision) {
+        vector<double4> temp(positions->getSize());
+        for (int i = 0; i < positions->getSize(); i++)
+            temp[i] = make_double4(0, 0, 0, 0);
+        positions->upload(temp);
+        for (int i = 0; i < velocities->getSize(); i++)
+            temp[i] = make_double4(0, 0, 0, 1);
+        velocities->upload(temp);
+    }
+    else {
+        vector<float4> temp(positions->getSize());
+        for (int i = 0; i < positions->getSize(); i++)
+            temp[i] = make_float4(0, 0, 0, 0);
+        positions->upload(temp);
+        for (int i = 0; i < velocities->getSize(); i++)
+            temp[i] = make_float4(0, 0, 0, 1);
+        velocities->upload(temp);
+    }

    // Create kernels.
    
@@ -114,8 +128,9 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
    pileKernel = cu.getKernel(module, "applyPileThermostat");
    stepKernel = cu.getKernel(module, "integrateStep");
    velocitiesKernel = cu.getKernel(module, "advanceVelocities");
-    copyToContextKernel = cu.getKernel(module, "copyToContext");
-    copyFromContextKernel = cu.getKernel(module, "copyFromContext");
+    copyPositionsToContextKernel = cu.getKernel(module, "copyPositionsToContext");
+    copyVelocitiesToContextKernel = cu.getKernel(module, "copyVelocitiesToContext");
+    copyForcesFromContextKernel = cu.getKernel(module, "copyForcesFromContext");
    translateKernel = cu.getKernel(module, "applyCellTranslations");
 }

@@ -129,19 +144,23 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    
    // Apply the PILE-L thermostat.
    
+    bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
    double dt = integrator.getStepSize();
    float dtFloat = (float) dt;
+    void* dtPtr = (useDoublePrecision ? (void*) &dt : (void*) &dtFloat);
    double kT = integrator.getTemperature()*BOLTZ;
    float kTFloat = (float) kT;
+    void* kTPtr = (useDoublePrecision ? (void*) &kT : (void*) &kTFloat);
    double friction = integrator.getFriction();
    float frictionFloat = (float) friction;
+    void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat);
    int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
-    void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, &dtFloat, &kTFloat, &frictionFloat};
+    void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr};
    cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);

    // Update positions and velocities.
    
-    void* stepArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), &dtFloat, &kTFloat};
+    void* stepArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr, kTPtr};
    cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
@@ -150,7 +169,7 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    
    // Update velocities.

-    void* velocitiesArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), &dtFloat};
+    void* velocitiesArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr};
    cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.
@@ -167,10 +186,10 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
 void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    for (int i = 0; i < numCopies; i++) {
        void* copyToContextArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
+        cu.executeKernel(copyPositionsToContextKernel, copyToContextArgs, cu.getNumAtoms());
        context.calcForcesAndEnergy(true, false);
        void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
-        cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
+        cu.executeKernel(copyForcesFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
        if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) {
            // Atoms may have been translated into a different periodic box, so apply
            // the same translation to all the beads.
@@ -190,11 +209,29 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
-    vector<float4> posq(cu.getPaddedNumAtoms());
-    cu.getPosq().download(posq);
-    for (int i = 0; i < numParticles; i++)
-        posq[i] = make_float4((float) pos[i][0], (float) pos[i][1], (float) pos[i][2], posq[i].w);
-    CUresult result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4));
+    CUresult result;
+    if (cu.getUseDoublePrecision()) {
+        vector<double4> posq(cu.getPaddedNumAtoms());
+        cu.getPosq().download(posq);
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = make_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w);
+        result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
+    }
+    else if (cu.getUseMixedPrecision()) {
+        vector<float4> posqf(cu.getPaddedNumAtoms());
+        cu.getPosq().download(posqf);
+        vector<double4> posq(cu.getPaddedNumAtoms());
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = make_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w);
+        result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4));
+    }
+    else {
+        vector<float4> posq(cu.getPaddedNumAtoms());
+        cu.getPosq().download(posq);
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = make_float4((float) pos[i][0], (float) pos[i][1], (float) pos[i][2], posq[i].w);
+        result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4));
+    }
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error uploading array "<<positions->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
@@ -207,11 +244,21 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
-    vector<float4> velm(cu.getPaddedNumAtoms());
-    cu.getVelm().download(velm);
-    for (int i = 0; i < numParticles; i++)
-        velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
-    CUresult result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4));
+    CUresult result;
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        vector<double4> velm(cu.getPaddedNumAtoms());
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; i++)
+            velm[i] = make_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
+        result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &velm[0], numParticles*sizeof(double4));
+    }
+    else {
+        vector<float4> velm(cu.getPaddedNumAtoms());
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; i++)
+            velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
+        result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4));
+    }
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
        str<<"Error uploading array "<<velocities->getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
@@ -221,9 +268,9 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve

 void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
    void* copyPositionsArgs[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
-    cu.executeKernel(copyToContextKernel, copyPositionsArgs, cu.getNumAtoms());
+    cu.executeKernel(copyPositionsToContextKernel, copyPositionsArgs, cu.getNumAtoms());
    void* copyVelocitiesArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy};
-    cu.executeKernel(copyToContextKernel, copyVelocitiesArgs, cu.getNumAtoms());
+    cu.executeKernel(copyVelocitiesToContextKernel, copyVelocitiesArgs, cu.getNumAtoms());
 }

 string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
@@ -237,10 +284,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");

    source<<"{\n";
-    source<<"float3* real0 = "<<variable<<"real;\n";
-    source<<"float3* imag0 = "<<variable<<"imag;\n";
-    source<<"float3* real1 = &temp[blockStart];\n";
-    source<<"float3* imag1 = &temp[blockStart+blockDim.x];\n";
+    source<<"mixed3* real0 = "<<variable<<"real;\n";
+    source<<"mixed3* imag0 = "<<variable<<"imag;\n";
+    source<<"mixed3* real1 = &temp[blockStart];\n";
+    source<<"mixed3* imag1 = &temp[blockStart+blockDim.x];\n";

    // Factor size, generating an appropriate block of code for each factor.

@@ -254,39 +301,39 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float3 c0r = real"<<input<<"[i];\n";
-            source<<"float3 c0i = imag"<<input<<"[i];\n";
-            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float3 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
-            source<<"float3 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
-            source<<"float3 d0r = c1r+c4r;\n";
-            source<<"float3 d0i = c1i+c4i;\n";
-            source<<"float3 d1r = c2r+c3r;\n";
-            source<<"float3 d1i = c2i+c3i;\n";
-            source<<"float3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
-            source<<"float3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
-            source<<"float3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
-            source<<"float3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
-            source<<"float3 d4r = d0r+d1r;\n";
-            source<<"float3 d4i = d0i+d1i;\n";
-            source<<"float3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
-            source<<"float3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
-            source<<"float3 d6r = c0r-0.25f*d4r;\n";
-            source<<"float3 d6i = c0i-0.25f*d4i;\n";
-            source<<"float3 d7r = d6r+d5r;\n";
-            source<<"float3 d7i = d6i+d5i;\n";
-            source<<"float3 d8r = d6r-d5r;\n";
-            source<<"float3 d8i = d6i-d5i;\n";
+            source<<"mixed3 c0r = real"<<input<<"[i];\n";
+            source<<"mixed3 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed3 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"mixed3 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"mixed3 d0r = c1r+c4r;\n";
+            source<<"mixed3 d0i = c1i+c4i;\n";
+            source<<"mixed3 d1r = c2r+c3r;\n";
+            source<<"mixed3 d1i = c2i+c3i;\n";
+            source<<"mixed3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
+            source<<"mixed3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
+            source<<"mixed3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
+            source<<"mixed3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
+            source<<"mixed3 d4r = d0r+d1r;\n";
+            source<<"mixed3 d4i = d0i+d1i;\n";
+            source<<"mixed3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
+            source<<"mixed3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
+            source<<"mixed3 d6r = c0r-0.25f*d4r;\n";
+            source<<"mixed3 d6i = c0i-0.25f*d4i;\n";
+            source<<"mixed3 d7r = d6r+d5r;\n";
+            source<<"mixed3 d7i = d6i+d5i;\n";
+            source<<"mixed3 d8r = d6r-d5r;\n";
+            source<<"mixed3 d8i = d6i-d5i;\n";
            string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-            source<<"float3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
-            source<<"float3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
-            source<<"float3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
-            source<<"float3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
+            source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
+            source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
+            source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
+            source<<"mixed3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
@@ -307,22 +354,22 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float3 c0r = real"<<input<<"[i];\n";
-            source<<"float3 c0i = imag"<<input<<"[i];\n";
-            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float3 d0r = c0r+c2r;\n";
-            source<<"float3 d0i = c0i+c2i;\n";
-            source<<"float3 d1r = c0r-c2r;\n";
-            source<<"float3 d1i = c0i-c2i;\n";
-            source<<"float3 d2r = c1r+c3r;\n";
-            source<<"float3 d2i = c1i+c3i;\n";
-            source<<"float3 d3r = "<<sign<<"*(c1i-c3i);\n";
-            source<<"float3 d3i = "<<sign<<"*(c3r-c1r);\n";
+            source<<"mixed3 c0r = real"<<input<<"[i];\n";
+            source<<"mixed3 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed3 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed3 d0r = c0r+c2r;\n";
+            source<<"mixed3 d0i = c0i+c2i;\n";
+            source<<"mixed3 d1r = c0r-c2r;\n";
+            source<<"mixed3 d1i = c0i-c2i;\n";
+            source<<"mixed3 d2r = c1r+c3r;\n";
+            source<<"mixed3 d2i = c1i+c3i;\n";
+            source<<"mixed3 d3r = "<<sign<<"*(c1i-c3i);\n";
+            source<<"mixed3 d3i = "<<sign<<"*(c3r-c1r);\n";
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
@@ -341,18 +388,18 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float3 c0r = real"<<input<<"[i];\n";
-            source<<"float3 c0i = imag"<<input<<"[i];\n";
-            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float3 d0r = c1r+c2r;\n";
-            source<<"float3 d0i = c1i+c2i;\n";
-            source<<"float3 d1r = c0r-0.5f*d0r;\n";
-            source<<"float3 d1i = c0i-0.5f*d0i;\n";
-            source<<"float3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
-            source<<"float3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
+            source<<"mixed3 c0r = real"<<input<<"[i];\n";
+            source<<"mixed3 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed3 d0r = c1r+c2r;\n";
+            source<<"mixed3 d0i = c1i+c2i;\n";
+            source<<"mixed3 d1r = c0r-0.5f*d0r;\n";
+            source<<"mixed3 d1i = c0i-0.5f*d0i;\n";
+            source<<"mixed3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
+            source<<"mixed3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
@@ -369,10 +416,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float3 c0r = real"<<input<<"[i];\n";
-            source<<"float3 c0i = imag"<<input<<"[i];\n";
-            source<<"float3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c0r = real"<<input<<"[i];\n";
+            source<<"mixed3 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed3 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed3 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
@@ -91,7 +91,7 @@ private:
    CudaArray* forces;
    CudaArray* positions;
    CudaArray* velocities;
-    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    CUfunction pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
 };

 } // namespace OpenMM

--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
-__device__ float3 multiplyComplexRealPart(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }

-__device__ float3 multiplyComplexImagPart(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }

-__device__ float3 multiplyComplexRealPartConj(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }

-__device__ float3 multiplyComplexImagPartConj(float2 c1, float3 c2r, float3 c2i) {
+__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }

 /**
 * Apply the PILE-L thermostat.
 */
-extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, unsigned int randomIndex,
-        float dt, float kT, float friction) {
+extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, unsigned int randomIndex,
+        mixed dt, mixed kT, mixed friction) {
    const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
-    const float c1_0 = EXP(-0.5f*dt*friction);
-    const float c2_0 = SQRT(1.0f-c1_0*c1_0);
-    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ float2 w[NUM_COPIES];
-    float3* vreal = &v[blockStart];
-    float3* vimag = &v[blockStart+blockDim.x];
+    const mixed nkT = NUM_COPIES*kT;
+    const mixed twown = 2.0f*nkT/HBAR;
+    const mixed c1_0 = EXP(-0.5f*dt*friction);
+    const mixed c2_0 = SQRT(1.0f-c1_0*c1_0);
+    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];
+    mixed3* vreal = &v[blockStart];
+    mixed3* vimag = &v[blockStart+blockDim.x];
    if (threadIdx.x < NUM_COPIES)
-        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    __syncthreads();
    randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES);
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float invMass = particleVelm.w;
-        float c3_0 = c2_0*SQRT(nkT*invMass);
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed invMass = particleVelm.w;
+        mixed c3_0 = c2_0*SQRT(nkT*invMass);
        
        // Forward FFT.
        
-        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
-        vimag[indexInBlock] = make_float3(0);
+        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vimag[indexInBlock] = make_mixed3(0);
        __syncthreads();
        FFT_V_FORWARD

@@ -53,28 +53,28 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
            // Apply a local Langevin thermostat to the centroid mode.

            float4 rand = random[randomIndex];
-            vreal[0] = vreal[0]*c1_0 + c3_0*make_float3(rand.x, rand.y, rand.z);
+            vreal[0] = vreal[0]*c1_0 + c3_0*make_mixed3(rand.x, rand.y, rand.z);
        }
        else {
            // Use critical damping white noise for the remaining modes.

            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
-            const float wk = twown*sin(k*M_PI/NUM_COPIES);
-            const float c1 = EXP(-wk*dt);
-            const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
-            const float c3 = c2*SQRT(nkT*invMass);
-            float4 rand1 = c3*random[randomIndex+k];
-            float4 rand2 = (isCenter ? make_float4(0) : c3*random[randomIndex+NUM_COPIES-k]);
-            vreal[indexInBlock] = c1*vreal[indexInBlock] + make_float3(rand1.x, rand1.y, rand1.z);
-            vimag[indexInBlock] = c1*vimag[indexInBlock] + (indexInBlock < NUM_COPIES/2 ? make_float3(rand2.x, rand2.y, rand2.z) : make_float3(-rand2.x, -rand2.y, -rand2.z));
+            const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
+            const mixed c1 = EXP(-wk*dt);
+            const mixed c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const mixed c3 = c2*SQRT(nkT*invMass);
+            float4 rand1 = random[randomIndex+k];
+            float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]);
+            vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z);
+            vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z));
        }
        __syncthreads();
        
        // Inverse FFT.
        
        FFT_V_BACKWARD
-        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
        randomIndex += blockDim.x*gridDim.x;
    }
 }
@@ -82,24 +82,24 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
 /**
 * Advance the positions and velocities.
 */
-extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long* force, float dt, float kT) {
+extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* force, mixed dt, mixed kT) {
    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
-    const float forceScale = 1/(float) 0xFFFFFFFF;
-    __shared__ float3 q[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 v[2*THREAD_BLOCK_SIZE];
-    __shared__ float3 temp[2*THREAD_BLOCK_SIZE];
-    __shared__ float2 w[NUM_COPIES];
+    const mixed nkT = NUM_COPIES*kT;
+    const mixed twown = 2.0f*nkT/HBAR;
+    const mixed forceScale = 1/(mixed) 0xFFFFFFFF;
+    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 v[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];

    // Update velocities.
    
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
        particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
        particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
        particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
@@ -108,23 +108,23 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
    
    // Evolve the free ring polymer by transforming to the frequency domain.

-    float3* qreal = &q[blockStart];
-    float3* qimag = &q[blockStart+blockDim.x];
-    float3* vreal = &v[blockStart];
-    float3* vimag = &v[blockStart+blockDim.x];
+    mixed3* qreal = &q[blockStart];
+    mixed3* qimag = &q[blockStart+blockDim.x];
+    mixed3* vreal = &v[blockStart];
+    mixed3* vimag = &v[blockStart+blockDim.x];
    if (threadIdx.x < NUM_COPIES)
-        w[indexInBlock] = make_float2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    __syncthreads();
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        
        // Forward FFT.
        
-        qreal[indexInBlock] = SCALE*make_float3(particlePosq.x, particlePosq.y, particlePosq.z);
-        qimag[indexInBlock] = make_float3(0);
-        vreal[indexInBlock] = SCALE*make_float3(particleVelm.x, particleVelm.y, particleVelm.z);
-        vimag[indexInBlock] = make_float3(0);
+        qreal[indexInBlock] = SCALE*make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
+        qimag[indexInBlock] = make_mixed3(0);
+        vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
+        vimag[indexInBlock] = make_mixed3(0);
        __syncthreads();
        FFT_Q_FORWARD
        FFT_V_FORWARD
@@ -136,12 +136,12 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
            qimag[0] += vimag[0]*dt;
        }
        else {
-            const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
-            const float wt = wk*dt;
-            const float coswt = cos(wt);
-            const float sinwt = sin(wt);
-            const float3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
-            const float3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
+            const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
+            const mixed wt = wk*dt;
+            const mixed coswt = cos(wt);
+            const mixed sinwt = sin(wt);
+            const mixed3 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
+            const mixed3 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
            qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
            vreal[indexInBlock] = vprimereal;
@@ -153,26 +153,26 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
        
        FFT_Q_BACKWARD
        FFT_V_BACKWARD
-        posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
-        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_float4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
+        posq[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*qreal[indexInBlock].x, SCALE*qreal[indexInBlock].y, SCALE*qreal[indexInBlock].z, particlePosq.w);
+        velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
    }
 }

 /**
 * Advance the velocities by a half step.
 */
-extern "C" __global__ void advanceVelocities(float4* velm, long long* force, float dt) {
+extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mixed dt) {
    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
    const int indexInBlock = threadIdx.x-blockStart;
-    const float forceScale = 1/(float) 0xFFFFFFFF;
+    const mixed forceScale = 1/(mixed) 0xFFFFFFFF;

    // Update velocities.
    
    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
-        float4 particleVelm = velm[index];
+        mixed4 particleVelm = velm[index];
        particleVelm.x += forceScale*force[forceIndex]*(0.5f*dt*particleVelm.w);
        particleVelm.y += forceScale*force[forceIndex+PADDED_NUM_ATOMS]*(0.5f*dt*particleVelm.w);
        particleVelm.z += forceScale*force[forceIndex+PADDED_NUM_ATOMS*2]*(0.5f*dt*particleVelm.w);
@@ -181,9 +181,20 @@ extern "C" __global__ void advanceVelocities(float4* velm, long long* force, flo
 }

 /**
- * Copy a set of per-atom values from the integrator's arrays to the context.
+ * Copy a set of positions from the integrator's arrays to the context.
 */
-extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, int copy) {
+extern "C" __global__ void copyPositionsToContext(mixed4* src, real4* dst, int* order, int copy) {
+    const int base = copy*PADDED_NUM_ATOMS;
+    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
+        mixed4 posq = src[base+order[particle]];
+        dst[particle] = make_real4(posq.x, posq.y, posq.z, posq.w);
+    }
+}
+
+/**
+ * Copy a set of velocities from the integrator's arrays to the context.
+ */
+extern "C" __global__ void copyVelocitiesToContext(mixed4* src, mixed4* dst, int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        dst[particle] = src[base+order[particle]];
@@ -191,9 +202,9 @@ extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, i
 }

 /**
- * Copy a set of per-atom force values from the context to the integrator's arrays.
+ * Copy a set of forces from the context to the integrator's arrays.
 */
-extern "C" __global__ void copyFromContext(long long* src, long long* dst, int* order, int copy) {
+extern "C" __global__ void copyForcesFromContext(long long* src, long long* dst, int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS*3;
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        dst[base+order[particle]] = src[particle];
@@ -205,10 +216,11 @@ extern "C" __global__ void copyFromContext(long long* src, long long* dst, int*
 /**
 * Update atom positions so all copies are offset by the same number of periodic box widths.
 */
-extern "C" __global__ void applyCellTranslations(float4* posq, float4* movedPos, int* order, int movedCopy) {
+extern "C" __global__ void applyCellTranslations(mixed4* posq, real4* movedPos, int* order, int movedCopy) {
    for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) {
        int index = order[particle];
-        float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index];
+        real4 p = movedPos[particle];
+        mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index];
        for (int copy = 0; copy < NUM_COPIES; copy++)
            posq[copy*PADDED_NUM_ATOMS+index] += delta;
    }

--- a/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
-    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
+        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)

 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
@@ -165,7 +165,6 @@ void testParaHydrogen() {
    vector<int> counts(numBins, 0);
    const double invBoxSize = 1.0/boxSize;
    double meanKE = 0.0;
-    const RealOpenMM hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
    for (int step = 0; step < numSteps; step++) {
        integ.step(20);
        vector<State> states(numCopies);
@@ -221,9 +220,11 @@ void testParaHydrogen() {
    ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
 }

-int main() {
+int main(int argc, char* argv[]) {
    try {
        Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
+        if (argc > 1)
+            Platform::getPlatformByName("CUDA").setPropertyDefaultValue("CudaPrecision", string(argv[1]));
        testFreeParticles();
        testParaHydrogen();
    }

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -49,6 +49,7 @@ OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
    if (velocities != NULL)
        delete velocities;
 }
+
 void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
@@ -59,20 +60,34 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
    if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cl.getPaddedNumAtoms();
-    forces = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdForces");
-    positions = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdPositions");
-    velocities = OpenCLArray::create<mm_float4>(cl, numCopies*paddedParticles, "rpmdVelocities");
+    int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
+    forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
+    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
+    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
+    positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
+    velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
    cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
-    vector<mm_float4> temp(positions->getSize());
-    for (int i = 0; i < positions->getSize(); i++)
-        temp[i] = mm_float4(0, 0, 0, 0);
-    positions->upload(temp);
-    for (int i = 0; i < velocities->getSize(); i++)
-        temp[i] = mm_float4(0, 0, 0, 1);
-    velocities->upload(temp);
+    if (useDoublePrecision) {
+        vector<mm_double4> temp(positions->getSize());
+        for (int i = 0; i < positions->getSize(); i++)
+            temp[i] = mm_double4(0, 0, 0, 0);
+        positions->upload(temp);
+        for (int i = 0; i < velocities->getSize(); i++)
+            temp[i] = mm_double4(0, 0, 0, 1);
+        velocities->upload(temp);
+    }
+    else {
+        vector<mm_float4> temp(positions->getSize());
+        for (int i = 0; i < positions->getSize(); i++)
+            temp[i] = mm_float4(0, 0, 0, 0);
+        positions->upload(temp);
+        for (int i = 0; i < velocities->getSize(); i++)
+            temp[i] = mm_float4(0, 0, 0, 1);
+        velocities->upload(temp);
+    }

    // Create kernels.
    
@@ -80,6 +95,7 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cl.intToString(numCopies);
+    defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
    defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
    defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
    defines["M_PI"] = cl.doubleToString(M_PI);
@@ -92,8 +108,9 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
    pileKernel = cl::Kernel(program, "applyPileThermostat");
    stepKernel = cl::Kernel(program, "integrateStep");
    velocitiesKernel = cl::Kernel(program, "advanceVelocities");
-    copyToContextKernel = cl::Kernel(program, "copyToContext");
-    copyFromContextKernel = cl::Kernel(program, "copyFromContext");
+    copyPositionsToContextKernel = cl::Kernel(program, "copyPositionsToContext");
+    copyVelocitiesToContextKernel = cl::Kernel(program, "copyVelocitiesToContext");
+    copyForcesFromContextKernel = cl::Kernel(program, "copyForcesFromContext");
    translateKernel = cl::Kernel(program, "applyCellTranslations");
 }

@@ -102,48 +119,56 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
        pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
-        pileKernel.setArg(1, 2*workgroupSize*sizeof(mm_float4), NULL);
-        pileKernel.setArg(2, 2*workgroupSize*sizeof(mm_float4), NULL);
-        pileKernel.setArg(3, numCopies*sizeof(mm_float2), NULL);
        stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
        stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer());
        stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer());
-        stepKernel.setArg(3, 2*workgroupSize*sizeof(mm_float4), NULL);
-        stepKernel.setArg(4, 2*workgroupSize*sizeof(mm_float4), NULL);
-        stepKernel.setArg(5, 2*workgroupSize*sizeof(mm_float4), NULL);
-        stepKernel.setArg(6, numCopies*sizeof(mm_float2), NULL);
        velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
        velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
        translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
        translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
        translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
+        copyPositionsToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
+        copyPositionsToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
+        copyPositionsToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
+        copyVelocitiesToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
+        copyVelocitiesToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
+        copyVelocitiesToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
+        copyForcesFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
+        copyForcesFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
+        copyForcesFromContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
    }
    
    // Loop over copies and compute the force on each one.
    
-    copyToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
-    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
-    copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
-    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
    if (!forcesAreValid)
        computeForces(context);
    
    // Apply the PILE-L thermostat.
    
+    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    const double dt = integrator.getStepSize();
-    pileKernel.setArg<cl_uint>(5, integration.prepareRandomNumbers(numParticles*numCopies));
-    pileKernel.setArg<cl::Buffer>(4, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
-    pileKernel.setArg<cl_float>(6, (cl_float) dt);
-    pileKernel.setArg<cl_float>(7, (cl_float) (integrator.getTemperature()*BOLTZ));
-    pileKernel.setArg<cl_float>(8, (cl_float) integrator.getFriction());
+    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
+    pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
+    if (useDoublePrecision) {
+        pileKernel.setArg<cl_double>(3, dt);
+        pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
+        pileKernel.setArg<cl_double>(5, integrator.getFriction());
+        stepKernel.setArg<cl_double>(3, dt);
+        stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
+        velocitiesKernel.setArg<cl_double>(2, dt);
+    }
+    else {
+        pileKernel.setArg<cl_float>(3, (cl_float) dt);
+        pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
+        pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
+        stepKernel.setArg<cl_float>(3, (cl_float) dt);
+        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
+        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
+    }
    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);

    // Update positions and velocities.
    
-    stepKernel.setArg<cl_float>(7, (cl_float) dt);
-    stepKernel.setArg<cl_float>(8, (cl_float) (integrator.getTemperature()*BOLTZ));
    cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
@@ -151,12 +176,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
    computeForces(context);
    
    // Update velocities.
-    velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

-    pileKernel.setArg<cl_uint>(5, integration.prepareRandomNumbers(numParticles*numCopies));
+    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);

    // Update the time and step count.
@@ -167,11 +191,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte

 void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    for (int i = 0; i < numCopies; i++) {
-        copyToContextKernel.setArg<cl_int>(3, i);
-        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
+        copyPositionsToContextKernel.setArg<cl_int>(3, i);
+        cl.executeKernel(copyPositionsToContextKernel, cl.getNumAtoms());
        context.calcForcesAndEnergy(true, false);
-        copyFromContextKernel.setArg<cl_int>(3, i);
-        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
+        copyForcesFromContextKernel.setArg<cl_int>(3, i);
+        cl.executeKernel(copyForcesFromContextKernel, cl.getNumAtoms());
        if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
            // Atoms may have been translated into a different periodic box, so apply
            // the same translation to all the beads.
@@ -191,11 +215,28 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
-    vector<mm_float4> posq(cl.getPaddedNumAtoms());
-    cl.getPosq().download(posq);
-    for (int i = 0; i < numParticles; i++)
-        posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w);
-    cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
+    if (cl.getUseDoublePrecision()) {
+        vector<mm_double4> posq(cl.getPaddedNumAtoms());
+        cl.getPosq().download(posq);
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w);
+        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
+    }
+    else if (cl.getUseMixedPrecision()) {
+        vector<mm_float4> posqf(cl.getPaddedNumAtoms());
+        cl.getPosq().download(posqf);
+        vector<mm_double4> posq(cl.getPaddedNumAtoms());
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w);
+        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
+    }
+    else {
+        vector<mm_float4> posq(cl.getPaddedNumAtoms());
+        cl.getPosq().download(posq);
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w);
+        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
+    }
 }

 void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
@@ -203,22 +244,27 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>&
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
-    vector<mm_float4> velm(cl.getPaddedNumAtoms());
-    cl.getVelm().download(velm);
-    for (int i = 0; i < numParticles; i++)
-        velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
-    cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+        vector<mm_double4> velm(cl.getPaddedNumAtoms());
+        cl.getVelm().download(velm);
+        for (int i = 0; i < numParticles; i++)
+            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
+        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
+    }
+    else {
+        vector<mm_float4> velm(cl.getPaddedNumAtoms());
+        cl.getVelm().download(velm);
+        for (int i = 0; i < numParticles; i++)
+            velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
+        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
+    }
 }

 void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
-    copyToContextKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
-    copyToContextKernel.setArg<cl_int>(3, copy);
-    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
-    copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
-    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
+    copyPositionsToContextKernel.setArg<cl_int>(3, copy);
+    cl.executeKernel(copyPositionsToContextKernel, cl.getNumAtoms());
+    copyVelocitiesToContextKernel.setArg<cl_int>(3, copy);
+    cl.executeKernel(copyVelocitiesToContextKernel, cl.getNumAtoms());
 }

 string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
@@ -232,10 +278,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");

    source<<"{\n";
-    source<<"__local float4* real0 = "<<variable<<"real;\n";
-    source<<"__local float4* imag0 = "<<variable<<"imag;\n";
-    source<<"__local float4* real1 = &temp[blockStart];\n";
-    source<<"__local float4* imag1 = &temp[blockStart+get_local_size(0)];\n";
+    source<<"__local mixed4* real0 = "<<variable<<"real;\n";
+    source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
+    source<<"__local mixed4* real1 = &temp[blockStart];\n";
+    source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";

    // Factor size, generating an appropriate block of code for each factor.

@@ -249,39 +295,39 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float4 c0r = real"<<input<<"[i];\n";
-            source<<"float4 c0i = imag"<<input<<"[i];\n";
-            source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
-            source<<"float4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
-            source<<"float4 d0r = c1r+c4r;\n";
-            source<<"float4 d0i = c1i+c4i;\n";
-            source<<"float4 d1r = c2r+c3r;\n";
-            source<<"float4 d1i = c2i+c3i;\n";
-            source<<"float4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
-            source<<"float4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
-            source<<"float4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
-            source<<"float4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
-            source<<"float4 d4r = d0r+d1r;\n";
-            source<<"float4 d4i = d0i+d1i;\n";
-            source<<"float4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
-            source<<"float4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
-            source<<"float4 d6r = c0r-0.25f*d4r;\n";
-            source<<"float4 d6i = c0i-0.25f*d4i;\n";
-            source<<"float4 d7r = d6r+d5r;\n";
-            source<<"float4 d7i = d6i+d5i;\n";
-            source<<"float4 d8r = d6r-d5r;\n";
-            source<<"float4 d8i = d6i-d5i;\n";
+            source<<"mixed4 c0r = real"<<input<<"[i];\n";
+            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"mixed4 d0r = c1r+c4r;\n";
+            source<<"mixed4 d0i = c1i+c4i;\n";
+            source<<"mixed4 d1r = c2r+c3r;\n";
+            source<<"mixed4 d1i = c2i+c3i;\n";
+            source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
+            source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
+            source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
+            source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
+            source<<"mixed4 d4r = d0r+d1r;\n";
+            source<<"mixed4 d4i = d0i+d1i;\n";
+            source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
+            source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
+            source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
+            source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
+            source<<"mixed4 d7r = d6r+d5r;\n";
+            source<<"mixed4 d7i = d6i+d5i;\n";
+            source<<"mixed4 d8r = d6r-d5r;\n";
+            source<<"mixed4 d8i = d6i-d5i;\n";
            string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-            source<<"float4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
-            source<<"float4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
-            source<<"float4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
-            source<<"float4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
+            source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
+            source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
+            source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
+            source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
@@ -302,22 +348,22 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float4 c0r = real"<<input<<"[i];\n";
-            source<<"float4 c0i = imag"<<input<<"[i];\n";
-            source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float4 d0r = c0r+c2r;\n";
-            source<<"float4 d0i = c0i+c2i;\n";
-            source<<"float4 d1r = c0r-c2r;\n";
-            source<<"float4 d1i = c0i-c2i;\n";
-            source<<"float4 d2r = c1r+c3r;\n";
-            source<<"float4 d2i = c1i+c3i;\n";
-            source<<"float4 d3r = "<<sign<<"*(c1i-c3i);\n";
-            source<<"float4 d3i = "<<sign<<"*(c3r-c1r);\n";
+            source<<"mixed4 c0r = real"<<input<<"[i];\n";
+            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"mixed4 d0r = c0r+c2r;\n";
+            source<<"mixed4 d0i = c0i+c2i;\n";
+            source<<"mixed4 d1r = c0r-c2r;\n";
+            source<<"mixed4 d1i = c0i-c2i;\n";
+            source<<"mixed4 d2r = c1r+c3r;\n";
+            source<<"mixed4 d2i = c1i+c3i;\n";
+            source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
+            source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
@@ -336,18 +382,18 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float4 c0r = real"<<input<<"[i];\n";
-            source<<"float4 c0i = imag"<<input<<"[i];\n";
-            source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float4 d0r = c1r+c2r;\n";
-            source<<"float4 d0i = c1i+c2i;\n";
-            source<<"float4 d1r = c0r-0.5f*d0r;\n";
-            source<<"float4 d1i = c0i-0.5f*d0i;\n";
-            source<<"float4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
-            source<<"float4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
+            source<<"mixed4 c0r = real"<<input<<"[i];\n";
+            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"mixed4 d0r = c1r+c2r;\n";
+            source<<"mixed4 d0i = c1i+c2i;\n";
+            source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
+            source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
+            source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
+            source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
@@ -364,10 +410,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
-            source<<"float4 c0r = real"<<input<<"[i];\n";
-            source<<"float4 c0i = imag"<<input<<"[i];\n";
-            source<<"float4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c0r = real"<<input<<"[i];\n";
+            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
+            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -92,7 +92,7 @@ private:
    OpenCLArray* forces;
    OpenCLArray* positions;
    OpenCLArray* velocities;
-    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyPositionsToContextKernel, copyVelocitiesToContextKernel, copyForcesFromContextKernel, translateKernel;
 };

 } // namespace OpenMM

--- a/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
+++ b/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
-float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexRealPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }

-float4 multiplyComplexImagPart(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexImagPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2i+c1.y*c2r;
 }

-float4 multiplyComplexRealPartConj(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexRealPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2r+c1.y*c2i;
 }

-float4 multiplyComplexImagPartConj(float2 c1, float4 c2r, float4 c2i) {
+mixed4 multiplyComplexImagPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
    return c1.x*c2i-c1.y*c2r;
 }

 /**
 * Apply the PILE-L thermostat.
 */
-__kernel void applyPileThermostat(__global float4* velm, __local float4* v, __local float4* temp, __local float2* w, __global float4* random, unsigned int randomIndex,
-        float dt, float kT, float friction) {
+__kernel void applyPileThermostat(__global mixed4* velm, __global float4* random, unsigned int randomIndex,
+        mixed dt, mixed kT, mixed friction) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
-    const float nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
-    const float c1_0 = EXP(-0.5f*dt*friction);
-    const float c2_0 = SQRT(1.0f-c1_0*c1_0);
-    __local float4* vreal = &v[blockStart];
-    __local float4* vimag = &v[blockStart+get_local_size(0)];
+    const mixed nkT = NUM_COPIES*kT;
+    const mixed twown = 2.0f*nkT/HBAR;
+    const mixed c1_0 = exp(-0.5f*dt*friction);
+    const mixed c2_0 = sqrt(1.0f-c1_0*c1_0);
+    __local mixed4 v[2*THREAD_BLOCK_SIZE];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];
+    __local mixed4* vreal = &v[blockStart];
+    __local mixed4* vimag = &v[blockStart+get_local_size(0)];
    if (get_local_id(0) < NUM_COPIES)
-        w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    barrier(CLK_LOCAL_MEM_FENCE);
    randomIndex += NUM_COPIES*(get_global_id(0)/NUM_COPIES);
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float invMass = particleVelm.w;
-        float c3_0 = c2_0*SQRT(nkT*invMass);
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed invMass = particleVelm.w;
+        mixed c3_0 = c2_0*sqrt(nkT*invMass);
        
        // Forward FFT.
        
        vreal[indexInBlock] = SCALE*particleVelm;
-        vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        barrier(CLK_GLOBAL_MEM_FENCE);
        FFT_V_FORWARD

@@ -49,19 +52,19 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
        if (indexInBlock == 0) {
            // Apply a local Langevin thermostat to the centroid mode.

-            vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*random[randomIndex].xyz;
+            vreal[0].xyz = vreal[0].xyz*c1_0 + c3_0*convert_mixed4(random[randomIndex]).xyz;
        }
        else {
            // Use critical damping white noise for the remaining modes.

            int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
            const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
-            const float wk = twown*sin(k*M_PI/NUM_COPIES);
-            const float c1 = EXP(-wk*dt);
-            const float c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
-            const float c3 = c2*SQRT(nkT*invMass);
-            float4 rand1 = c3*random[randomIndex+k];
-            float4 rand2 = (isCenter ? 0.0f : c3*random[randomIndex+NUM_COPIES-k]);
+            const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
+            const mixed c1 = exp(-wk*dt);
+            const mixed c2 = sqrt((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
+            const mixed c3 = c2*sqrt(nkT*invMass);
+            mixed4 rand1 = c3*convert_mixed4(random[randomIndex+k]);
+            mixed4 rand2 = (isCenter ? 0.0f : c3*convert_mixed4(random[randomIndex+NUM_COPIES-k]));
            vreal[indexInBlock].xyz = c1*vreal[indexInBlock].xyz + rand1.xyz;
            vimag[indexInBlock].xyz = c1*vimag[indexInBlock].xyz + (indexInBlock < NUM_COPIES/2 ? rand2.xyz : -rand2.xyz);
        }
@@ -78,42 +81,45 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
 /**
 * Advance the positions and velocities.
 */
-__kernel void integrateStep(__global float4* posq, __global float4* velm, __global float4* force,
-        __local float4* q, __local float4* v, __local float4* temp, __local float2* w, float dt, float kT) {
+__kernel void integrateStep(__global mixed4* posq, __global mixed4* velm, __global real4* force, mixed dt, mixed kT) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
-    const float nkT = NUM_COPIES*kT;
-    const float twown = 2.0f*nkT/HBAR;
+    const mixed nkT = NUM_COPIES*kT;
+    const mixed twown = 2.0f*nkT/HBAR;
+    __local mixed4 q[2*THREAD_BLOCK_SIZE];
+    __local mixed4 v[2*THREAD_BLOCK_SIZE];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];

    // Update velocities.
    
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
-        float4 particleVelm = velm[index];
-        particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w);
+        mixed4 particleVelm = velm[index];
+        particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
        velm[index] = particleVelm;
    }
    
    // Evolve the free ring polymer by transforming to the frequency domain.

-    __local float4* qreal = &q[blockStart];
-    __local float4* qimag = &q[blockStart+get_local_size(0)];
-    __local float4* vreal = &v[blockStart];
-    __local float4* vimag = &v[blockStart+get_local_size(0)];
+    __local mixed4* qreal = &q[blockStart];
+    __local mixed4* qimag = &q[blockStart+get_local_size(0)];
+    __local mixed4* vreal = &v[blockStart];
+    __local mixed4* vimag = &v[blockStart+get_local_size(0)];
    if (get_local_id(0) < NUM_COPIES)
-        w[indexInBlock] = (float2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
-        float4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
-        float4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
        
        // Forward FFT.
        
        qreal[indexInBlock] = SCALE*particlePosq;
-        qimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        qimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        vreal[indexInBlock] = SCALE*particleVelm;
-        vimag[indexInBlock] = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
+        vimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
        barrier(CLK_GLOBAL_MEM_FENCE);
        FFT_Q_FORWARD
        FFT_V_FORWARD
@@ -125,12 +131,12 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
            qimag[0].xyz += vimag[0].xyz*dt;
        }
        else {
-            const float wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
-            const float wt = wk*dt;
-            const float coswt = cos(wt);
-            const float sinwt = sin(wt);
-            const float4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
-            const float4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
+            const mixed wk = twown*sin(indexInBlock*M_PI/NUM_COPIES);
+            const mixed wt = wk*dt;
+            const mixed coswt = cos(wt);
+            const mixed sinwt = sin(wt);
+            const mixed4 vprimereal = vreal[indexInBlock]*coswt - qreal[indexInBlock]*(wk*sinwt); // Advance velocity from t to t+dt
+            const mixed4 vprimeimag = vimag[indexInBlock]*coswt - qimag[indexInBlock]*(wk*sinwt);
            qreal[indexInBlock] = vreal[indexInBlock]*(sinwt/wk) + qreal[indexInBlock]*coswt; // Advance position from t to t+dt
            qimag[indexInBlock] = vimag[indexInBlock]*(sinwt/wk) + qimag[indexInBlock]*coswt;
            vreal[indexInBlock] = vprimereal;
@@ -150,7 +156,7 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
 /**
 * Advance the velocities by a half step.
 */
-__kernel void advanceVelocities(__global float4* velm, __global float4* force, float dt) {
+__kernel void advanceVelocities(__global mixed4* velm, __global real4* force, mixed dt) {
    const int numBlocks = get_global_size(0)/NUM_COPIES;
    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
    const int indexInBlock = get_local_id(0)-blockStart;
@@ -159,16 +165,26 @@ __kernel void advanceVelocities(__global float4* velm, __global float4* force, f
    
    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
-        float4 particleVelm = velm[index];
-        particleVelm.xyz += force[index].xyz*(0.5f*dt*particleVelm.w);
+        mixed4 particleVelm = velm[index];
+        particleVelm.xyz += convert_mixed4(force[index]).xyz*(0.5f*dt*particleVelm.w);
        velm[index] = particleVelm;
    }
 }

 /**
- * Copy a set of per-atom values from the integrator's arrays to the context.
+ * Copy a set of positions from the integrator's arrays to the context.
 */
-__kernel void copyToContext(__global float4* src, __global float4* dst, __global int* order, int copy) {
+__kernel void copyPositionsToContext(__global mixed4* src, __global real4* dst, __global int* order, int copy) {
+    const int base = copy*PADDED_NUM_ATOMS;
+    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
+        dst[particle] = convert_real4(src[base+order[particle]]);
+    }
+}
+
+/**
+ * Copy a set of velocities from the integrator's arrays to the context.
+ */
+__kernel void copyVelocitiesToContext(__global mixed4* src, __global mixed4* dst, __global int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        dst[particle] = src[base+order[particle]];
@@ -176,9 +192,9 @@ __kernel void copyToContext(__global float4* src, __global float4* dst, __global
 }

 /**
- * Copy a set of per-atom values from the context to the integrator's arrays.
+ * Copy a set forces from the context to the integrator's arrays.
 */
-__kernel void copyFromContext(__global float4* src, __global float4* dst, __global int* order, int copy) {
+__kernel void copyForcesFromContext(__global real4* src, __global real4* dst, __global int* order, int copy) {
    const int base = copy*PADDED_NUM_ATOMS;
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        dst[base+order[particle]] = src[particle];
@@ -188,10 +204,10 @@ __kernel void copyFromContext(__global float4* src, __global float4* dst, __glob
 /**
 * Update atom positions so all copies are offset by the same number of periodic box widths.
 */
-__kernel void applyCellTranslations(__global float4* posq, __global float4* movedPos, __global int* order, int movedCopy) {
+__kernel void applyCellTranslations(__global mixed4* posq, __global real4* movedPos, __global int* order, int movedCopy) {
    for (int particle = get_global_id(0); particle < NUM_ATOMS; particle += get_global_size(0)) {
        int index = order[particle];
-        float4 delta = movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index];
+        mixed4 delta = convert_mixed4(movedPos[particle])-posq[movedCopy*PADDED_NUM_ATOMS+index];
        for (int copy = 0; copy < NUM_COPIES; copy++)
            posq[copy*PADDED_NUM_ATOMS+index] += delta;
    }

--- a/plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
+++ b/plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_RPMD_TARGET})
-    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    IF (OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)
+        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    ENDIF(OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS)

 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
+++ b/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -221,9 +221,11 @@ void testParaHydrogen() {
    ASSERT_USUALLY_EQUAL_TOL(60.0, 1.5*temperature+meanKE, 0.02);
 }

-int main() {
+int main(int argc, char* argv[]) {
    try {
        Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
+        if (argc > 1)
+            Platform::getPlatformByName("OpenCL").setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
        testFreeParticles();
        testParaHydrogen();
    }