Finished implementing double precision for OpenCL

59f8cf87 · Peter Eastman · c8dac206 · 59f8cf87 · 59f8cf87 · 59f8cf87
Commit 59f8cf87 authored Oct 17, 2012 by Peter Eastman
3 changed files
--- a/platforms/opencl/src/kernels/pme_cpu.cl
+++ b/platforms/opencl/src/kernels/pme_cpu.cl
-__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict pmeBsplineDTheta) {
-    const float4 scale = 1.0f/(PME_ORDER-1);
+__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global real4* restrict pmeBsplineDTheta) {
+    const real4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
-        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
-        __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
+        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+        __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
-	    data[j] = 0.0f;
-            ddata[j] = 0.0f;
+	    data[j] = 0;
+            ddata[j] = 0;
        }
-        float4 pos = posq[i];
+        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
-        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-        data[PME_ORDER-1] = 0.0f;
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
+        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0);
+        data[PME_ORDER-1] = 0;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
-            float div = 1.0f/(j-1.0f);
+            real div = 1.0f/(j-1.0f);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
+                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
            data[0] = div*(- dr+1.0f)*data[0];
        }
        ddata[0] = -data[0];
@@ -30,7 +30,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
@@ -42,27 +42,27 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
 /**
 * This kernel is not actually used when running on a CPU.
 */
-__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
 }

-__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
-        pmeGrid[gridIndex] = (float2) (0.0f, 0.0f);
+        pmeGrid[gridIndex] = (real2) 0;
    for (int atom = 0; atom < NUM_ATOMS; atom++) {
-        float4 pos = posq[atom];
+        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
-        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
+        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
-        float atomCharge = pos.w*EPSILON_FACTOR;
+        real atomCharge = pos.w*EPSILON_FACTOR;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -82,10 +82,10 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
    }
 }

-__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
-        __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
+__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
+        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    float energy = 0.0f;
+    real energy = 0;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
@@ -96,60 +96,60 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-        float mhx = mx*invPeriodicBoxSize.x;
-        float mhy = my*invPeriodicBoxSize.y;
-        float mhz = mz*invPeriodicBoxSize.z;
-        float bx = pmeBsplineModuliX[kx];
-        float by = pmeBsplineModuliY[ky];
-        float bz = pmeBsplineModuliZ[kz];
-        float2 grid = pmeGrid[index];
-        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        float denom = m2*bx*by*bz;
-        float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-        pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
+        real mhx = mx*invPeriodicBoxSize.x;
+        real mhy = my*invPeriodicBoxSize.y;
+        real mhz = mz*invPeriodicBoxSize.z;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real2 grid = pmeGrid[index];
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict pmeBsplineTheta, __global const float4* restrict pmeBsplineDTheta) {
+__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict pmeBsplineTheta, __global const real4* restrict pmeBsplineDTheta) {
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
-        float4 force = 0.0f;
-        float4 pos = posq[atom];
+        real4 force = 0;
+        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-            float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
-            float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
+            real tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
+            real dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
-                float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
+                real ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
+                real dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
-                    float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
+                    real tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
+                    real dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    float gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index].x;
                    force.x += dtx*ty*tz*gridvalue;
                    force.y += tx*dty*tz*gridvalue;
                    force.z += tx*ty*dtz*gridvalue;
                }
            }
        }
-        float4 totalForce = forceBuffers[atom];
-        float q = pos.w*EPSILON_FACTOR;
+        real4 totalForce = forceBuffers[atom];
+        real q = pos.w*EPSILON_FACTOR;
        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;

--- a/platforms/opencl/src/kernels/removeCM.cl
+++ b/platforms/opencl/src/kernels/removeCM.cl
@@ -98,7 +98,7 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4*
        temp[thread] += temp[thread+2];
 #endif
    barrier(CLK_LOCAL_MEM_FENCE);
-    cm = (temp[0]+temp[1])*INVERSE_TOTAL_MASS;
+    cm = (float4) (INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z), 0);

    // Now remove the center of mass velocity from each atom.


--- a/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
+++ b/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
@@ -84,7 +84,7 @@ void testTemperature() {
    }
    ke /= numSteps;
    double expected = 0.5*numParticles*3*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testConstraints() {