Continuing to implement double precision in OpenCL

c8dac206 · Peter Eastman · 34938e2c · c8dac206 · c8dac206 · c8dac206
Commit c8dac206 authored Oct 17, 2012 by Peter Eastman
16 changed files
--- a/platforms/opencl/src/kernels/customHbondForce.cl
+++ b/platforms/opencl/src/kernels/customHbondForce.cl
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-float4 delta(float4 vec1, float4 vec2) {
-    float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+real4 delta(real4 vec1, real4 vec2) {
+    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -11,8 +11,8 @@ float4 delta(float4 vec1, float4 vec2) {
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
-float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
-    float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
 #ifdef USE_PERIODIC
    result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
    result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
@@ -25,15 +25,15 @@ float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 in
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-float computeAngle(float4 vec1, float4 vec2) {
-    float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    float cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    float angle;
+real computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        float4 crossProduct = cross(vec1, vec2);
-        float scale = vec1.w*vec2.w;
+        real4 crossProduct = cross(vec1, vec2);
+        real scale = vec1.w*vec2.w;
        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = PI-angle;
@@ -46,8 +46,8 @@ float computeAngle(float4 vec1, float4 vec2) {
 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-float4 computeCross(float4 vec1, float4 vec2) {
-    float4 result = cross(vec1, vec2);
+real4 computeCross(real4 vec1, real4 vec2) {
+    real4 result = cross(vec1, vec2);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -55,24 +55,24 @@ float4 computeCross(float4 vec1, float4 vec2) {
 /**
 * Compute forces on donors.
 */
-__kernel void computeDonorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
-        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
+__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
+        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
        PARAMETER_ARGUMENTS) {
-    float energy = 0.0f;
-    float4 f1 = (float4) 0;
-    float4 f2 = (float4) 0;
-    float4 f3 = (float4) 0;
+    real energy = 0;
+    real4 f1 = (real4) 0;
+    real4 f2 = (real4) 0;
+    real4 f3 = (real4) 0;
    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
        // Load information about the donor this thread will compute forces on.

        int donorIndex = donorStart+get_global_id(0);
        int4 atoms, exclusionIndices;
-        float4 d1, d2, d3;
+        real4 d1, d2, d3;
        if (donorIndex < NUM_DONORS) {
            atoms = donorAtoms[donorIndex];
-            d1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0);
-            d2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0);
-            d3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0);
+            d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
+            d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
+            d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
 #ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[donorIndex];
 #endif
@@ -85,9 +85,9 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
            int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
            if (get_local_id(0) < blockSize) {
                int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
-                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0);
-                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0);
-                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0);
+                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
+                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
+                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (donorIndex < NUM_DONORS) {
@@ -99,10 +99,10 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
 #endif
                    // Compute the interaction between a donor and an acceptor.

-                    float4 a1 = posBuffer[3*index];
-                    float4 a2 = posBuffer[3*index+1];
-                    float4 a3 = posBuffer[3*index+2];
-                    float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+                    real4 a1 = posBuffer[3*index];
+                    real4 a2 = posBuffer[3*index+1];
+                    real4 a3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif
@@ -120,19 +120,19 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
            int4 bufferIndices = donorBufferIndices[donorIndex];
            if (atoms.x > -1) {
                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f1.xyz;
                forceBuffers[offset] = force;
            }
            if (atoms.y > -1) {
                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f2.xyz;
                forceBuffers[offset] = force;
            }
            if (atoms.z > -1) {
                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f3.xyz;
                forceBuffers[offset] = force;
            }
@@ -143,23 +143,23 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
 /**
 * Compute forces on acceptors.
 */
-__kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
-        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local float4* restrict posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
+__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
+        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
        PARAMETER_ARGUMENTS) {
-    float4 f1 = (float4) 0;
-    float4 f2 = (float4) 0;
-    float4 f3 = (float4) 0;
+    real4 f1 = (real4) 0;
+    real4 f2 = (real4) 0;
+    real4 f3 = (real4) 0;
    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
        // Load information about the acceptor this thread will compute forces on.

        int acceptorIndex = acceptorStart+get_global_id(0);
        int4 atoms, exclusionIndices;
-        float4 a1, a2, a3;
+        real4 a1, a2, a3;
        if (acceptorIndex < NUM_ACCEPTORS) {
            atoms = acceptorAtoms[acceptorIndex];
-            a1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0);
-            a2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0);
-            a3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0);
+            a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
+            a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
+            a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
 #ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[acceptorIndex];
 #endif
@@ -172,9 +172,9 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
            int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
            if (get_local_id(0) < blockSize) {
                int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
-                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0);
-                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0);
-                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0);
+                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
+                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
+                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (acceptorIndex < NUM_ACCEPTORS) {
@@ -186,10 +186,10 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
 #endif
                    // Compute the interaction between a donor and an acceptor.

-                    float4 d1 = posBuffer[3*index];
-                    float4 d2 = posBuffer[3*index+1];
-                    float4 d3 = posBuffer[3*index+2];
-                    float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
+                    real4 d1 = posBuffer[3*index];
+                    real4 d2 = posBuffer[3*index+1];
+                    real4 d3 = posBuffer[3*index+2];
+                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif
@@ -207,19 +207,19 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
            int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
            if (atoms.x > -1) {
                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f1.xyz;
                forceBuffers[offset] = force;
            }
            if (atoms.y > -1) {
                unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f2.xyz;
                forceBuffers[offset] = force;
            }
            if (atoms.z > -1) {
                unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
-                float4 force = forceBuffers[offset];
+                real4 force = forceBuffers[offset];
                force.xyz += f3.xyz;
                forceBuffers[offset] = force;
            }

--- a/platforms/opencl/src/kernels/ewald.cl
+++ b/platforms/opencl/src/kernels/ewald.cl
-float2 multofFloat2(float2 a, float2 b) {
-    return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
+real2 multofReal2(real2 a, real2 b) {
+    return (real2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
 }

 /**
 * Precompute the cosine and sine sums which appear in each force term.
 */

-__kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __global const float4* restrict posq, __global float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
+__kernel void calculateEwaldCosSinSums(__global real* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
    const unsigned int ksizex = 2*KMAX_X-1;
    const unsigned int ksizey = 2*KMAX_Y-1;
    const unsigned int ksizez = 2*KMAX_Z-1;
    const unsigned int totalK = ksizex*ksizey*ksizez;
    unsigned int index = get_global_id(0);
-    float energy = 0.0f;
+    real energy = 0.0f;
    while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
        index += get_global_size(0);
    while (index < totalK) {
@@ -23,29 +23,29 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
        int ry = remainder/ksizez;
        int rz = remainder - ry*ksizez - KMAX_Z + 1;
        ry += -KMAX_Y + 1;
-        float kx = rx*reciprocalPeriodicBoxSize.x;
-        float ky = ry*reciprocalPeriodicBoxSize.y;
-        float kz = rz*reciprocalPeriodicBoxSize.z;
+        real kx = rx*reciprocalPeriodicBoxSize.x;
+        real ky = ry*reciprocalPeriodicBoxSize.y;
+        real kz = rz*reciprocalPeriodicBoxSize.z;

        // Compute the sum for this wave vector.

-        float2 sum = 0.0f;
+        real2 sum = 0.0f;
        for (int atom = 0; atom < NUM_ATOMS; atom++) {
-            float4 apos = posq[atom];
-            float phase = apos.x*kx;
-            float2 structureFactor = (float2) (cos(phase), sin(phase));
+            real4 apos = posq[atom];
+            real phase = apos.x*kx;
+            real2 structureFactor = (real2) (cos(phase), sin(phase));
            phase = apos.y*ky;
-            structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
            phase = apos.z*kz;
-            structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase)));
+            structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
            sum += apos.w*structureFactor;
        }
        cosSinSum[index] = sum;

        // Compute the contribution to the energy.

-        float k2 = kx*kx + ky*ky + kz*kz;
-        float ak = EXP(k2*EXP_COEFFICIENT) / k2;
+        real k2 = kx*kx + ky*ky + kz*kz;
+        real ak = EXP(k2*EXP_COEFFICIENT) / k2;
        energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
        index += get_global_size(0);
    }
@@ -57,36 +57,36 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
 * previous routine.
 */

-__kernel void calculateEwaldForces(__global float4* restrict forceBuffers, __global const float4* restrict posq, __global const float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
+__kernel void calculateEwaldForces(__global real4* restrict forceBuffers, __global const real4* restrict posq, __global const real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
    unsigned int atom = get_global_id(0);
    while (atom < NUM_ATOMS) {
-        float4 force = forceBuffers[atom];
-        float4 apos = posq[atom];
+        real4 force = forceBuffers[atom];
+        real4 apos = posq[atom];

        // Loop over all wave vectors.

        int lowry = 0;
        int lowrz = 1;
        for (int rx = 0; rx < KMAX_X; rx++) {
-            float kx = rx*reciprocalPeriodicBoxSize.x;
+            real kx = rx*reciprocalPeriodicBoxSize.x;
            for (int ry = lowry; ry < KMAX_Y; ry++) {
-                float ky = ry*reciprocalPeriodicBoxSize.y;
-                float phase = apos.x*kx;
-                float2 tab_xy = (float2) (cos(phase), sin(phase));
+                real ky = ry*reciprocalPeriodicBoxSize.y;
+                real phase = apos.x*kx;
+                real2 tab_xy = (real2) (cos(phase), sin(phase));
                phase = apos.y*ky;
-                tab_xy = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
+                tab_xy = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
                for (int rz = lowrz; rz < KMAX_Z; rz++) {
-                    float kz = rz*reciprocalPeriodicBoxSize.z;
+                    real kz = rz*reciprocalPeriodicBoxSize.z;

                    // Compute the force contribution of this wave vector.

                    int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
-                    float k2 = kx*kx + ky*ky + kz*kz;
-                    float ak = EXP(k2*EXP_COEFFICIENT)/k2;
+                    real k2 = kx*kx + ky*ky + kz*kz;
+                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
-                    float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
-                    float2 sum = cosSinSum[index];
-                    float dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
+                    real2 structureFactor = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
+                    real2 sum = cosSinSum[index];
+                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
                    force.x += dEdR*kx;
                    force.y += dEdR*ky;
                    force.z += dEdR*kz;

--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
-float2 multiplyComplex(float2 c1, float2 c2) {
-    return (float2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
+real2 multiplyComplex(real2 c1, real2 c2) {
+    return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
 }

 /**
 * Perform a 1D FFT on each row along one axis.
 */

-__kernel void execFFT(__global const float2* restrict in, __global float2* restrict out, float sign, __local float2* restrict w,
-        __local float2* restrict data0, __local float2* restrict data1) {
+__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
+        __local real2* restrict data0, __local real2* restrict data1) {
    for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
-        w[i] = (float2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
+        w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int index = get_group_id(0); index < XSIZE*YSIZE; index += get_num_groups(0)) {
        int x = index/YSIZE;

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
@@ -8,19 +8,19 @@
 /**
 * Find a bounding box for the atoms in each block.
 */
-__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
+__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
-        float4 pos = posq[base];
+        real4 pos = posq[base];
 #ifdef USE_PERIODIC
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 firstPoint = pos;
+        real4 firstPoint = pos;
 #endif
-        float4 minPos = pos;
-        float4 maxPos = pos;
+        real4 minPos = pos;
+        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
@@ -46,8 +46,8 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
 * to global memory.
 */
 void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex,
-            __global unsigned int* interactionCount, __global ushort2* interactingTiles, float cutoffSquared, float4 periodicBoxSize,
-            float4 invPeriodicBoxSize, __global const float4* posq, __global const float4* blockCenter, __global const float4* blockBoundingBox, unsigned int maxTiles) {
+            __global unsigned int* interactionCount, __global ushort2* interactingTiles, real cutoffSquared, real4 periodicBoxSize,
+            real4 invPeriodicBoxSize, __global const real4* posq, __global const real4* blockCenter, __global const real4* blockBoundingBox, unsigned int maxTiles) {
    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
@@ -91,7 +91,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s

    int index = get_local_id(0)&(TILE_SIZE-1);
    int group = get_local_id(0)/TILE_SIZE;
-    float4 center, boxSize, pos;
+    real4 center, boxSize, pos;
    for (int tile = 0; tile < numValid; tile++) {
        int x = temp[tile].x;
        int y = temp[tile].y;
@@ -103,12 +103,12 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
 #ifdef MAC_AMD_WORKAROUND
        int box = (group == 0 ? x : y);
        int atom = (group == 0 ? y : x)*TILE_SIZE+index;
-        __global float* bc = (__global float*) blockCenter;
-        __global float* bb = (__global float*) blockBoundingBox;
-        __global float* ps = (__global float*) posq;
-        center = (float4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0.0f);
-        boxSize = (float4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0.0f);
-        pos = (float4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0.0f);
+        __global real* bc = (__global real*) blockCenter;
+        __global real* bb = (__global real*) blockBoundingBox;
+        __global real* ps = (__global real*) posq;
+        center = (real4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0);
+        boxSize = (real4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0);
+        pos = (real4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0);
 #else
        center = blockCenter[(group == 0 ? x : y)];
        boxSize = blockBoundingBox[(group == 0 ? x : y)];
@@ -117,13 +117,13 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s

        // Find the distance of the atom from the bounding box.

-        float4 delta = pos-center;
+        real4 delta = pos-center;
 #ifdef USE_PERIODIC
        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-        delta = max((float4) 0.0f, fabs(delta)-boxSize);
+        delta = max((real4) 0, fabs(delta)-boxSize);
        __local ushort* flag = (__local ushort*) &buffer[tile];
        if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared)
            flag[group] = false;
@@ -155,9 +155,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
-__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
-        __global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
-        __global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
+__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
+        __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
        unsigned int endTileIndex) {
    __local ushort2 buffer[BUFFER_SIZE];
    __local int valid[BUFFER_SIZE];
@@ -194,26 +194,26 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
            // Find the distance between the bounding boxes of the two cells.

 #ifdef MAC_AMD_WORKAROUND
-            __global float* bc = (__global float*) blockCenter;
-            __global float* bb = (__global float*) blockBoundingBox;
-            float4 bcx = (float4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0.0f);
-            float4 bcy = (float4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0.0f);
-            float4 delta = bcx-bcy;
-            float4 boxSizea = (float4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0.0f);
-            float4 boxSizeb = (float4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0.0f);
+            __global real* bc = (__global real*) blockCenter;
+            __global real* bb = (__global real*) blockBoundingBox;
+            real4 bcx = (real4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0);
+            real4 bcy = (real4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0);
+            real4 delta = bcx-bcy;
+            real4 boxSizea = (real4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0);
+            real4 boxSizeb = (real4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0);
 #else
-            float4 delta = blockCenter[x]-blockCenter[y];
-            float4 boxSizea = blockBoundingBox[x];
-            float4 boxSizeb = blockBoundingBox[y];
+            real4 delta = blockCenter[x]-blockCenter[y];
+            real4 boxSizea = blockBoundingBox[x];
+            real4 boxSizeb = blockBoundingBox[y];
 #endif
 #ifdef USE_PERIODIC
            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-            delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
-            delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
-            delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+            delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+            delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+            delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
            if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
                // Add this tile to the buffer.

@@ -241,8 +241,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
 * Compare each atom in one block to the bounding box of another block, and set
 * flags for which ones are interacting.
 */
-__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter,
-            __global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
+__kernel void findInteractionsWithinBlocks(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global const ushort2* restrict tiles, __global const real4* restrict blockCenter,
+            __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int numTiles = interactionCount[0];
@@ -253,7 +253,7 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
    if (numTiles > maxTiles)
        return;
    unsigned int lasty = 0xFFFFFFFF;
-    float4 apos;
+    real4 apos;
    while (pos < end) {
        // Extract the coordinates of this tile
        ushort2 tileIndices = tiles[pos];
@@ -266,20 +266,20 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
        else {
            // Load the bounding box for x and the atom positions for y.

-            float4 center = blockCenter[x];
-            float4 boxSize = blockBoundingBox[x];
+            real4 center = blockCenter[x];
+            real4 boxSize = blockBoundingBox[x];
            if (y != lasty)
                apos = posq[y*TILE_SIZE+index];

            // Find the distance of the atom from the bounding box.

-            float4 delta = apos-center;
+            real4 delta = apos-center;
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-            delta = max((float4) 0.0f, fabs(delta)-boxSize);
+            delta = max((real4) 0, fabs(delta)-boxSize);
            int thread = get_local_id(0);
            flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);


--- a/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
@@ -8,19 +8,19 @@
 /**
 * Find a bounding box for the atoms in each block.
 */
-__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
+__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
-        float4 pos = posq[base];
+        real4 pos = posq[base];
 #ifdef USE_PERIODIC
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 firstPoint = pos;
+        real4 firstPoint = pos;
 #endif
-        float4 minPos = pos;
-        float4 maxPos = pos;
+        real4 minPos = pos;
+        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
@@ -46,14 +46,14 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
 * to global memory.
 */
 void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
-            __global unsigned int* interactionFlags, float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize,
-            __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, unsigned int maxTiles) {
+            __global unsigned int* interactionFlags, real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+            __global real4* posq, __global real4* blockCenter, __global real4* blockBoundingBox, unsigned int maxTiles) {
    // Filter the list of tiles by comparing the distance from each atom to the other bounding box.

    unsigned int flagsBuffer[2*BUFFER_SIZE];
-    float4 atomPositions[TILE_SIZE];
+    real4 atomPositions[TILE_SIZE];
    int lasty = -1;
-    float4 centery, boxSizey;
+    real4 centery, boxSizey;
    for (int tile = 0; tile < numValid; ) {
        int x = buffer[tile].x;
        int y = buffer[tile].y;
@@ -64,8 +64,8 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*

        // Load the atom positions and bounding boxes.

-        float4 centerx = blockCenter[x];
-        float4 boxSizex = blockBoundingBox[x];
+        real4 centerx = blockCenter[x];
+        real4 boxSizex = blockBoundingBox[x];
        if (y != lasty) {
            for (int atom = 0; atom < TILE_SIZE; atom++)
                atomPositions[atom] = posq[y*TILE_SIZE+atom];
@@ -78,18 +78,18 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*

        unsigned int flags1 = 0, flags2 = 0;
        for (int atom = 0; atom < TILE_SIZE; atom++) {
-            float4 delta = atomPositions[atom]-centerx;
+            real4 delta = atomPositions[atom]-centerx;
 #ifdef USE_PERIODIC
            delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-            delta = max((float4) 0.0f, fabs(delta)-boxSizex);
+            delta = max((real4) 0, fabs(delta)-boxSizex);
            if (dot(delta.xyz, delta.xyz) < cutoffSquared)
                flags1 += 1 << atom;
            delta = posq[x*TILE_SIZE+atom]-centery;
 #ifdef USE_PERIODIC
            delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-            delta = max((float4) 0.0f, fabs(delta)-boxSizey);
+            delta = max((real4) 0, fabs(delta)-boxSizey);
            if (dot(delta.xyz, delta.xyz) < cutoffSquared)
                flags2 += 1 << atom;
        }
@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
-__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
-        __global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
-        __global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
+__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
+        __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
        unsigned int endTileIndex) {
    ushort2 buffer[BUFFER_SIZE];
    int valuesInBuffer = 0;
@@ -142,17 +142,17 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox

        // Find the distance between the bounding boxes of the two cells.

-        float4 delta = blockCenter[x]-blockCenter[y];
+        real4 delta = blockCenter[x]-blockCenter[y];
 #ifdef USE_PERIODIC
        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-        float4 boxSizea = blockBoundingBox[x];
-        float4 boxSizeb = blockBoundingBox[y];
-        delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
-        delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
-        delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+        real4 boxSizea = blockBoundingBox[x];
+        real4 boxSizeb = blockBoundingBox[y];
+        delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+        delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+        delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
        if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
            // Add this tile to the buffer.


--- a/platforms/opencl/src/kernels/harmonicAngleForce.cl
+++ b/platforms/opencl/src/kernels/harmonicAngleForce.cl
 float2 angleParams = PARAMS[index];
-float deltaIdeal = theta-angleParams.x;
+real deltaIdeal = theta-angleParams.x;
 energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
-float dEdAngle = angleParams.y*deltaIdeal;
+real dEdAngle = angleParams.y*deltaIdeal;
--- a/platforms/opencl/src/kernels/harmonicBondForce.cl
+++ b/platforms/opencl/src/kernels/harmonicBondForce.cl
 float2 bondParams = PARAMS[index];
-float deltaIdeal = r-bondParams.x;
+real deltaIdeal = r-bondParams.x;
 energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
-float dEdR = bondParams.y * deltaIdeal;
+real dEdR = bondParams.y * deltaIdeal;
--- a/platforms/opencl/src/kernels/nonbondedExceptions.cl
+++ b/platforms/opencl/src/kernels/nonbondedExceptions.cl
 float4 exceptionParams = PARAMS[index];
-float4 delta = pos2-pos1;
-float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-float invR = RSQRT(r2);
-float sig2 = invR*exceptionParams.y;
+real4 delta = pos2-pos1;
+real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+real invR = RSQRT(r2);
+real sig2 = invR*exceptionParams.y;
 sig2 *= sig2;
-float sig6 = sig2*sig2*sig2;
-float dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
-float tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
+real sig6 = sig2*sig2*sig2;
+real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
+real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
 dEdR += exceptionParams.x*invR;
 dEdR *= invR*invR;
 tempEnergy += exceptionParams.x*invR;
 energy += tempEnergy;
 delta.xyz *= dEdR;
-float4 force1 = -delta;
-float4 force2 = delta;
+real4 force1 = -delta;
+real4 force2 = delta;
--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
 #define TILE_SIZE 32

 typedef struct {
-    float x, y, z;
-    float q;
-    float fx, fy, fz;
+    real x, y, z;
+    real q;
+    real fx, fy, fz;
    ATOM_PARAMETER_DATA
 } AtomData;

@@ -11,11 +11,11 @@ typedef struct {
 * Compute nonbonded interactions.
 */

-__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+__kernel void computeNonbonded(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif
@@ -28,7 +28,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
    unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
-    float energy = 0.0f;
+    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData localData[TILE_SIZE];

@@ -71,7 +71,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
        if (lasty != y) {
            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
                unsigned int j = y*TILE_SIZE + localAtomIndex;
-                float4 tempPosq = posq[j];
+                real4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
                localData[localAtomIndex].z = tempPosq.z;
@@ -87,34 +87,34 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
                unsigned int excl = exclusions[exclusionIndex+tgx];
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                float4 force = 0.0f;
-                float4 posq1 = posq[atom1];
+                real4 force = 0;
+                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
-                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
-                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                    float r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    float invR = RSQRT(r2);
-                    float r = RECIP(invR);
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
                    unsigned int atom2 = j;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                    float dEdR = 0.0f;
+                    real dEdR = 0;
 #else
-                    float4 dEdR1 = (float4) 0.0f;
-                    float4 dEdR2 = (float4) 0.0f;
+                    real4 dEdR1 = (real4) 0;
+                    real4 dEdR2 = (real4) 0;
 #endif
-                    float tempEnergy = 0.0f;
+                    real tempEnergy = 0;
                    COMPUTE_INTERACTION
                    energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -138,9 +138,9 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                localData[tgx].fx = 0.0f;
-                localData[tgx].fy = 0.0f;
-                localData[tgx].fz = 0.0f;
+                localData[tgx].fx = 0;
+                localData[tgx].fy = 0;
+                localData[tgx].fz = 0;
            }
 #ifdef USE_CUTOFF
            unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
@@ -151,31 +151,31 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    if ((flags2&(1<<tgx)) != 0) {
                        unsigned int atom1 = x*TILE_SIZE+tgx;
-                        float4 force = 0.0f;
-                        float4 posq1 = posq[atom1];
+                        real4 force = 0;
+                        real4 posq1 = posq[atom1];
                        LOAD_ATOM1_PARAMETERS
                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            if ((flags1&(1<<j)) != 0) {
                                bool isExcluded = false;
-                                float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
-                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                                real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                                float r2 = dot(delta.xyz, delta.xyz);
+                                real r2 = dot(delta.xyz, delta.xyz);
                                if (r2 < CUTOFF_SQUARED) {
-                                    float invR = RSQRT(r2);
-                                    float r = RECIP(invR);
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
                                    unsigned int atom2 = j;
                                    LOAD_ATOM2_PARAMETERS
                                    atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                                    float dEdR = 0.0f;
+                                    real dEdR = 0;
 #else
-                                    float4 dEdR1 = (float4) 0.0f;
-                                    float4 dEdR2 = (float4) 0.0f;
+                                    real4 dEdR1 = (real4) 0;
+                                    real4 dEdR2 = (real4) 0;
 #endif
-                                    float tempEnergy = 0.0f;
+                                    real tempEnergy = 0;
                                    COMPUTE_INTERACTION
                                    energy += tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -208,8 +208,8 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
-                    float4 force = 0.0f;
-                    float4 posq1 = posq[atom1];
+                    real4 force = 0;
+                    real4 posq1 = posq[atom1];
                    LOAD_ATOM1_PARAMETERS
 #ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
@@ -218,27 +218,27 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
 #ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
 #endif
-                        float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
-                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
 #endif
-                        float r2 = dot(delta.xyz, delta.xyz);
+                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
 #endif
-                        float invR = RSQRT(r2);
-                        float r = RECIP(invR);
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                        float dEdR = 0.0f;
+                        real dEdR = 0;
 #else
-                        float4 dEdR1 = (float4) 0.0f;
-                        float4 dEdR2 = (float4) 0.0f;
+                        real4 dEdR1 = (real4) 0;
+                        real4 dEdR2 = (real4) 0;
 #endif
-                        float tempEnergy = 0.0f;
+                        real tempEnergy = 0;
                        COMPUTE_INTERACTION
                        energy += tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -272,7 +272,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                float4 f = forceBuffers[offset];
+                real4 f = forceBuffers[offset];
                f.x += localData[tgx].fx;
                f.y += localData[tgx].fy;
                f.z += localData[tgx].fz;

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -10,16 +10,16 @@
 // aligned which wastes space and causes LDS bank conflicts as stride is no
 // longer odd DWORDS.
 typedef struct {
-    float x, y, z;
-} UnalignedFloat3;
+    real x, y, z;
+} UnalignedReal3;

 typedef struct {
-    float x, y, z;
-    float q;
-    float fx, fy, fz;
+    real x, y, z;
+    real q;
+    real fx, fy, fz;
    ATOM_PARAMETER_DATA
 #ifndef PARAMETER_SIZE_IS_EVEN
-    float padding;
+    real padding;
 #endif
 } AtomData;

@@ -32,13 +32,13 @@ void computeNonbonded(
 #ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
 #else
-        __global float4* restrict forceBuffers,
+        __global real4* restrict forceBuffers,
 #endif
-        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif
@@ -51,10 +51,10 @@ void computeNonbonded(
    unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
 #endif
-    float energy = 0.0f;
+    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData localData[TILE_SIZE];
-    __local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE];
+    __local UnalignedReal3 localForce[FORCE_WORK_GROUP_SIZE];
 #ifdef USE_EXCLUSIONS
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];
@@ -83,8 +83,8 @@ void computeNonbonded(
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        float4 force = 0.0f;
-        float4 posq1 = posq[atom1];
+        real4 force = 0;
+        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS

        // Locate the exclusion data for this tile.
@@ -121,25 +121,25 @@ void computeNonbonded(
                bool isExcluded = !(excl & 0x1);
 #endif
                unsigned int atom2 = baseLocalAtom+j;
-                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float invR = RSQRT(r2);
-                float r = RECIP(invR);
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+baseLocalAtom+j;
 #ifdef USE_SYMMETRIC
-                float dEdR = 0.0f;
+                real dEdR = 0;
 #else
-                float4 dEdR1 = (float4) 0.0f;
-                float4 dEdR2 = (float4) 0.0f;
+                real4 dEdR1 = (real4) 0;
+                real4 dEdR2 = (real4) 0;
 #endif
-                float tempEnergy = 0.0f;
+                real tempEnergy = 0;
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -173,8 +173,8 @@ void computeNonbonded(
 #else
                unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                // Cheaper to load/store float4 than float3.
-                float4 sum = forceBuffers[offset];
+                // Cheaper to load/store real4 than real3.
+                real4 sum = forceBuffers[offset];
                sum.xyz += force.xyz;
                forceBuffers[offset] = sum;
 #endif
@@ -187,16 +187,16 @@ void computeNonbonded(
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                const unsigned int localAtomIndex = tgx;
                unsigned int j = y*TILE_SIZE + tgx;
-                float4 tempPosq = posq[j];
+                real4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
                localData[localAtomIndex].z = tempPosq.z;
                localData[localAtomIndex].q = tempPosq.w;
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
-            localForce[get_local_id(0)].x = 0.0f;
-            localForce[get_local_id(0)].y = 0.0f;
-            localForce[get_local_id(0)].z = 0.0f;
+            localForce[get_local_id(0)].x = 0;
+            localForce[get_local_id(0)].y = 0;
+            localForce[get_local_id(0)].z = 0;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.
@@ -210,26 +210,26 @@ void computeNonbonded(
 #ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
 #endif
-                float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
-                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
+                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float invR = RSQRT(r2);
-                float r = RECIP(invR);
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = RECIP(invR);
                int atom2 = tj;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
-                float dEdR = 0.0f;
+                real dEdR = 0;
 #else
-                float4 dEdR1 = (float4) 0.0f;
-                float4 dEdR2 = (float4) 0.0f;
+                real4 dEdR1 = (real4) 0;
+                real4 dEdR2 = (real4) 0;
 #endif
-                float tempEnergy = 0.0f;
+                real tempEnergy = 0;
                COMPUTE_INTERACTION
                energy += tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -277,9 +277,9 @@ void computeNonbonded(
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
 #endif
-                // Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
-                float4 sum1 = forceBuffers[offset1];
-                float4 sum2 = forceBuffers[offset2];
+                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
+                real4 sum1 = forceBuffers[offset1];
+                real4 sum2 = forceBuffers[offset2];
                sum1.x += localData[tgx].fx + force.x;
                sum1.y += localData[tgx].fy + force.y;
                sum1.z += localData[tgx].fz + force.z;

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -6,12 +6,12 @@
 #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)

 typedef struct {
-    float x, y, z;
-    float q;
-    float fx, fy, fz;
+    real x, y, z;
+    real q;
+    real fx, fy, fz;
    ATOM_PARAMETER_DATA
 #ifndef PARAMETER_SIZE_IS_EVEN
-    float padding;
+    real padding;
 #endif
 } AtomData;

@@ -22,13 +22,13 @@ __kernel void computeNonbonded(
 #ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
 #else
-        __global float4* restrict forceBuffers,
+        __global real4* restrict forceBuffers,
 #endif
-        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif
@@ -43,9 +43,9 @@ __kernel void computeNonbonded(
    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
 #endif
-    float energy = 0.0f;
+    real energy = 0;
    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
-    __local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
+    __local real tempBuffer[3*FORCE_WORK_GROUP_SIZE];
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
    __local int2* reservedBlocks = (__local int2*) exclusionRange;
@@ -56,7 +56,7 @@ __kernel void computeNonbonded(
        const unsigned int tbx = get_local_id(0) - tgx;
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
-        float4 force = 0.0f;
+        real4 force = 0;
        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
@@ -75,7 +75,7 @@ __kernel void computeNonbonded(
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
-            float4 posq1 = posq[atom1];
+            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.
@@ -111,25 +111,25 @@ __kernel void computeNonbonded(
                    bool isExcluded = !(excl & 0x1);
 #endif
                    int atom2 = tbx+j;
-                    float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                    real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = RSQRT(r2);
-                    float r = RECIP(invR);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
-                    float dEdR = 0.0f;
+                    real dEdR = 0;
 #else
-                    float4 dEdR1 = (float4) 0.0f;
-                    float4 dEdR2 = (float4) 0.0f;
+                    real4 dEdR1 = (real4) 0;
+                    real4 dEdR2 = (real4) 0;
 #endif
-                    float tempEnergy = 0.0f;
+                    real tempEnergy = 0;
                    COMPUTE_INTERACTION
                    energy += 0.5f*tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -147,15 +147,15 @@ __kernel void computeNonbonded(

                const unsigned int localAtomIndex = get_local_id(0);
                unsigned int j = y*TILE_SIZE + tgx;
-                float4 tempPosq = posq[j];
+                real4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
                localData[localAtomIndex].z = tempPosq.z;
                localData[localAtomIndex].q = tempPosq.w;
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                localData[localAtomIndex].fx = 0.0f;
-                localData[localAtomIndex].fy = 0.0f;
-                localData[localAtomIndex].fz = 0.0f;
+                localData[localAtomIndex].fx = 0;
+                localData[localAtomIndex].fy = 0;
+                localData[localAtomIndex].fz = 0;
 #ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags != 0xFFFFFFFF) {
@@ -171,25 +171,25 @@ __kernel void computeNonbonded(
                                int atom2 = tbx+j;
                                int bufferIndex = 3*get_local_id(0);
 #ifdef USE_SYMMETRIC
-                                float dEdR = 0.0f;
+                                real dEdR = 0;
 #else
-                                float4 dEdR1 = (float4) 0.0f;
-                                float4 dEdR2 = (float4) 0.0f;
+                                real4 dEdR1 = (real4) 0;
+                                real4 dEdR2 = (real4) 0;
 #endif
-                                float tempEnergy = 0.0f;
-                                float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                                real tempEnergy = 0;
+                                real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
                                if (r2 < CUTOFF_SQUARED) {
 #endif
-                                    float invR = RSQRT(r2);
-                                    float r = RECIP(invR);
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
                                    LOAD_ATOM2_PARAMETERS
                                    atom2 = y*TILE_SIZE+j;
                                    COMPUTE_INTERACTION
@@ -241,28 +241,28 @@ __kernel void computeNonbonded(
                        bool isExcluded = !(excl & 0x1);
 #endif
                        int atom2 = tbx+tj;
-                        float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
-                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+                        real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
 #endif
-                            float invR = RSQRT(r2);
-                            float r = RECIP(invR);
+                            real invR = RSQRT(r2);
+                            real r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
-                            float dEdR = 0.0f;
+                            real dEdR = 0;
 #else
-                            float4 dEdR1 = (float4) 0.0f;
-                            float4 dEdR2 = (float4) 0.0f;
+                            real4 dEdR1 = (real4) 0;
+                            real4 dEdR2 = (real4) 0;
 #endif
-                            float tempEnergy = 0.0f;
+                            real tempEnergy = 0;
                            COMPUTE_INTERACTION
                            energy += tempEnergy;
 #ifdef USE_SYMMETRIC
@@ -347,7 +347,7 @@ __kernel void computeNonbonded(
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
-                        forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
+                        forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
                    }
                    done = true;
                    if (tgx == 0)

--- a/platforms/opencl/src/kernels/periodicTorsionForce.cl
+++ b/platforms/opencl/src/kernels/periodicTorsionForce.cl
 float4 torsionParams = PARAMS[index];
-float deltaAngle = torsionParams.z*theta-torsionParams.y;
+real deltaAngle = torsionParams.z*theta-torsionParams.y;
 energy += torsionParams.x*(1.0f+cos(deltaAngle));
-float sinDeltaAngle = sin(deltaAngle);
-float dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
+real sinDeltaAngle = sin(deltaAngle);
+real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
-__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache,
-        __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
-    const float4 scale = 1.0f/(PME_ORDER-1);
+__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
+        __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+    const real4 scale = 1/(real) (PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
-        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
-        float4 pos = posq[i];
+        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
-        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
@@ -19,15 +19,15 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
-            float div = 1.0f/(j-1.0f);
+            real div = RECIP(j-1.0f);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
+                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
            data[0] = div*(- dr+1.0f)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
@@ -39,7 +39,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
 /**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
-__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
@@ -66,11 +66,11 @@ __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __g
 * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
 * some work in the charge spreading kernel.
 */
-__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    for (int i = start; i < end; ++i) {
-        float posz = posq[pmeAtomGridIndex[i].x].z;
+        real posz = posq[pmeAtomGridIndex[i].x].z;
        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
        int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
        pmeAtomGridIndex[i].y = z;
@@ -83,14 +83,14 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
 #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)

 __kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
-void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
+        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
    int iy = remainder/PME_ORDER;
    int iz = remainder-iy*PME_ORDER;
-    __local float4 theta[PME_ORDER];
-    __local float charge[BUFFER_SIZE];
+    __local real4 theta[PME_ORDER];
+    __local real charge[BUFFER_SIZE];
    __local int basex[BUFFER_SIZE];
    __local int basey[BUFFER_SIZE];
    __local int basez[BUFFER_SIZE];
@@ -101,7 +101,7 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
            if (get_local_id(0) < BUFFER_SIZE) {
                int atomIndex = baseIndex+get_local_id(0);
                if (atomIndex < NUM_ATOMS) {
-                    float4 pos = posq[atomIndex];
+                    real4 pos = posq[atomIndex];
                    charge[get_local_id(0)] = pos.w;
                    pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
                    pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
@@ -118,32 +118,40 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
                if (get_local_id(0) < PME_ORDER)
                    theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
                barrier(CLK_LOCAL_MEM_FENCE);
-                float add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
+                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
                int x = basex[index]+ix;
                int y = basey[index]+iy;
                int z = basez[index]+iz;
                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+#ifdef USE_DOUBLE_PRECISION
+                atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0xFFFFFFFF));
+#else
                atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
+#endif
            }
        }
    }
 }

 __kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
-    __global float2* floatGrid = (__global float2*) pmeGrid;
+    __global real2* realGrid = (__global real2*) pmeGrid;
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
+    real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+#ifdef USE_DOUBLE_PRECISION
+        long value = pmeGrid[2*index];
+#else
        long value = pmeGrid[index];
-        float2 floatValue = (float2) ((float) (value*scale), 0.0f);
-        floatGrid[index] = floatValue;
+#endif
+        real2 realValue = (real2) ((real) (value*scale), 0);
+        realGrid[index] = realValue;
    }
 }
 #else
-__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) {
+__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
+        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -153,7 +161,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
        gridPoint.y = remainder/GRID_SIZE_Z;
        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        float result = 0.0f;
+        real result = 0.0f;

        // Loop over all atoms that affect this grid point.

@@ -174,7 +182,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                    float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
+                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                }
                if (z1 > gridPoint.z)
@@ -189,21 +197,21 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
                        int atomIndex = atomData.x;
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
+                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }
            }
        }
-        pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
+        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
    }
 }
 #endif

-__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
-        __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
+__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
+        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    float energy = 0.0f;
+    real energy = 0.0f;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
@@ -214,36 +222,36 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-        float mhx = mx*invPeriodicBoxSize.x;
-        float mhy = my*invPeriodicBoxSize.y;
-        float mhz = mz*invPeriodicBoxSize.z;
-        float bx = pmeBsplineModuliX[kx];
-        float by = pmeBsplineModuliY[ky];
-        float bz = pmeBsplineModuliZ[kz];
-        float2 grid = pmeGrid[index];
-        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        float denom = m2*bx*by*bz;
-        float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-        pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
+        real mhx = mx*invPeriodicBoxSize.x;
+        real mhy = my*invPeriodicBoxSize.y;
+        real mhz = mz*invPeriodicBoxSize.z;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real2 grid = pmeGrid[index];
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid,
-        float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) {
-    const float4 scale = 1.0f/(PME_ORDER-1);
-    __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
-    __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
+__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) {
+    const real4 scale = 1/(real) (PME_ORDER-1);
+    __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+    __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
-        float4 force = 0.0f;
-        float4 pos = posq[atom];
+        real4 force = 0.0f;
+        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
@@ -251,15 +259,15 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

-        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
-            float div = 1.0f/(j-1.0f);
+            real div = RECIP(j-1.0f);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
+                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
            data[0] = div*(- dr+1.0f)*data[0];
        }
        ddata[0] = -data[0];
@@ -267,7 +275,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];

        // Compute the force on this atom.
@@ -282,7 +290,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    float gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index].x;
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
 #ifndef MAC_AMD_WORKAROUND
@@ -302,14 +310,14 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    float gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index].x;
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
                }
            }
        }
 #endif
-        float4 totalForce = forceBuffers[atom];
-        float q = pos.w*EPSILON_FACTOR;
+        real4 totalForce = forceBuffers[atom];
+        real q = pos.w*EPSILON_FACTOR;
        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;

--- a/platforms/opencl/src/kernels/rbTorsionForce.cl
+++ b/platforms/opencl/src/kernels/rbTorsionForce.cl
@@ -4,9 +4,9 @@ if (theta < 0.0f)
 else
    theta -= PI;
 cosangle = -cosangle;
-float cosFactor = cosangle;
-float dEdAngle = -torsionParams.s1;
-float rbEnergy = torsionParams.s0;
+real cosFactor = cosangle;
+real dEdAngle = -torsionParams.s1;
+real rbEnergy = torsionParams.s0;
 rbEnergy += torsionParams.s1*cosFactor;
 dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
 cosFactor *= cosangle;

--- a/platforms/opencl/src/kernels/torsionForce.cl
+++ b/platforms/opencl/src/kernels/torsionForce.cl
-const float PI = 3.14159265358979323846f;
-float4 v0 = (float4) (pos1.xyz-pos2.xyz, 0.0f);
-float4 v1 = (float4) (pos3.xyz-pos2.xyz, 0.0f);
-float4 v2 = (float4) (pos3.xyz-pos4.xyz, 0.0f);
-float4 cp0 = cross(v0, v1);
-float4 cp1 = cross(v1, v2);
-float cosangle = dot(normalize(cp0), normalize(cp1));
-float theta;
+const real PI = 3.14159265358979323846f;
+real4 v0 = (real4) (pos1.xyz-pos2.xyz, 0.0f);
+real4 v1 = (real4) (pos3.xyz-pos2.xyz, 0.0f);
+real4 v2 = (real4) (pos3.xyz-pos4.xyz, 0.0f);
+real4 cp0 = cross(v0, v1);
+real4 cp1 = cross(v1, v2);
+real cosangle = dot(normalize(cp0), normalize(cp1));
+real theta;
 if (cosangle > 0.99f || cosangle < -0.99f) {
    // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-    float4 cross_prod = cross(cp0, cp1);
-    float scale = dot(cp0, cp0)*dot(cp1, cp1);
+    real4 cross_prod = cross(cp0, cp1);
+    real scale = dot(cp0, cp0)*dot(cp1, cp1);
    theta = asin(sqrt(dot(cross_prod, cross_prod)/scale));
-    if (cosangle < 0.0f)
+    if (cosangle < 0)
        theta = PI-theta;
 }
 else
   theta = acos(cosangle);
 theta = (dot(v0, cp1) >= 0 ? theta : -theta);
 COMPUTE_FORCE
-float normCross1 = dot(cp0, cp0);
-float normSqrBC = dot(v1, v1);
-float normBC = sqrt(normSqrBC);
-float normCross2 = dot(cp1, cp1);
-float dp = 1.0f/normSqrBC;
-float4 ff = (float4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
-float4 force1 = ff.x*cp0;
-float4 force4 = ff.w*cp1;
-float4 s = ff.y*force1 - ff.z*force4;
-float4 force2 = s-force1;
-float4 force3 = -s-force4;
+real normCross1 = dot(cp0, cp0);
+real normSqrBC = dot(v1, v1);
+real normBC = sqrt(normSqrBC);
+real normCross2 = dot(cp1, cp1);
+real dp = 1.0f/normSqrBC;
+real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
+real4 force1 = ff.x*cp0;
+real4 force4 = ff.w*cp1;
+real4 s = ff.y*force1 - ff.z*force4;
+real4 force2 = s-force1;
+real4 force3 = -s-force4;
--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -69,11 +69,11 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
 * Sum a collection of buffers into the first one.
 */

-__kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) {
+__kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, int numBuffers) {
    int index = get_global_id(0);
    int totalSize = bufferSize*numBuffers;
    while (index < bufferSize) {
-        float4 sum = buffer[index];
+        real4 sum = buffer[index];
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            sum += buffer[i];
        buffer[index] = sum;
@@ -84,11 +84,11 @@ __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSiz
 /**
 * Sum the various buffers containing forces.
 */
-__kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) {
+__kernel void reduceForces(__global const long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
    int totalSize = bufferSize*numBuffers;
-    float scale = 1.0f/(float) 0xFFFFFFFF;
+    real scale = 1/(real) 0xFFFFFFFF;
    for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
-        float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
+        real4 sum = (real4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0);
        for (int i = index; i < totalSize; i += bufferSize)
            sum += buffer[i];
        buffer[index] = sum;