Check accuracy of native_ functions and only use them if they are accurate enough

b2960968 · Peter Eastman · a0651b5b · b2960968 · b2960968 · b2960968
Commit b2960968 authored Apr 29, 2010 by Peter Eastman
15 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -96,6 +96,40 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
    utilities = createProgram(OpenCLKernelSources::utilities);
    clearBufferKernel = cl::Kernel(utilities, "clearBuffer");
    reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
+    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
+    cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
+    OpenCLArray<mm_float4> values(*this, 20, "values", true);
+    float nextValue = 1e-4;
+    for (int i = 0; i < values.getSize(); ++i) {
+        values[i].x = nextValue;
+        nextValue *= M_PI;
+    }
+    values.upload();
+    accuracyKernel.setArg<cl::Buffer>(0, values.getDeviceBuffer());
+    accuracyKernel.setArg<cl_int>(1, values.getSize());
+    executeKernel(accuracyKernel, values.getSize());
+    values.download();
+    double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0;
+    for (int i = 0; i < values.getSize(); ++i) {
+        double correctSqrt = sqrt(values[i].x);
+        maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].y)/correctSqrt);
+        maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].z)*correctSqrt);
+        maxRecipError = max(maxRecipError, fabs(1.0/values[i].x-values[i].w)/values[i].w);
+    }
+    if (maxSqrtError < 1e-6)
+        compilationOptions += " -DSQRT=native_sqrt";
+    else
+        compilationOptions += " -DSQRT=sqrt";
+    if (maxRsqrtError < 1e-6)
+        compilationOptions += " -DRSQRT=native_rsqrt";
+    else
+        compilationOptions += " -DRSQRT=rsqrt";
+    if (maxRecipError < 1e-6)
+        compilationOptions += " -DRECIP=native_recip";
+    else
+        compilationOptions += " -DRECIP=1.0f/";
 }
 OpenCLContext::~OpenCLContext() {

--- a/platforms/opencl/src/kernels/customBondForce.cl
+++ b/platforms/opencl/src/kernels/customBondForce.cl
@@ -14,7 +14,7 @@ __kernel void computeCustomBondForces(int numAtoms, int numBonds, __global float
        // Compute the force.
-        float r = native_sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+        float r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
        COMPUTE_FORCE
        delta.xyz *= -dEdR/r;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
@@ -64,7 +64,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+j;
                float dEdR = 0.0f;
@@ -140,7 +140,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+tj;
                float dEdR = 0.0f;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
@@ -63,7 +63,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+j;
                float dEdR = 0.0f;
@@ -136,7 +136,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    float r = native_sqrt(r2);
+                    float r = SQRT(r2);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y+tj;
                    float dEdR = 0.0f;

--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
@@ -61,7 +61,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+j;
                float tempValue1 = 0.0f;
@@ -134,7 +134,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+tj;
                float tempValue1 = 0.0f;

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
@@ -61,7 +61,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
 #ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
 #endif
-                float r = native_sqrt(r2);
+                float r = SQRT(r2);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+j;
                float tempValue1 = 0.0f;
@@ -122,7 +122,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
                            float tempValue1 = 0.0f;
                            float tempValue2 = 0.0f;
                            if (r2 < CUTOFF_SQUARED) {
-                                float r = native_sqrt(r2);
+                                float r = SQRT(r2);
                                LOAD_ATOM2_PARAMETERS
                                atom2 = y+j;
                                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
@@ -177,7 +177,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
-                    float r = native_sqrt(r2);
+                    float r = SQRT(r2);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y+tj;
                    float tempValue1 = 0.0f;

--- a/platforms/opencl/src/kernels/customHbondForce.cl
+++ b/platforms/opencl/src/kernels/customHbondForce.cl
@@ -27,14 +27,14 @@ float4 deltaPeriodic(float4 vec1, float4 vec2) {
 */
 float computeAngle(float4 vec1, float4 vec2) {
    float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    float cosine = dotProduct*native_rsqrt(vec1.w*vec2.w);
+    float cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    float angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.
        float4 crossProduct = cross(vec1, vec2);
        float scale = vec1.w*vec2.w;
-        angle = asin(native_sqrt(dot(crossProduct, crossProduct)/scale));
+        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -65,8 +65,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 #else
                if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
 #endif
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
                    float rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
@@ -133,8 +133,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 #else
                if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
 #endif
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
                    float rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
@@ -241,14 +241,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                    delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
                    float expTerm = exp(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
-                    float denominator = native_sqrt(denominator2);
+                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
                    float Gpol = tempEnergy/denominator2;
                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -315,14 +315,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                    delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
                    float expTerm = exp(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
-                    float denominator = native_sqrt(denominator2);
+                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
                    float Gpol = tempEnergy/denominator2;
                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -65,8 +65,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 #else
                if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
 #endif
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                    float rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
@@ -130,8 +130,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 #else
                            if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
 #endif
-                                float invR = native_rsqrt(r2);
+                                float invR = RSQRT(r2);
-                                float r = native_recip(invR);
+                                float r = RECIP(invR);
                                float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                                float rScaledRadiusJ = r+params2.y;
                                if (params1.x < rScaledRadiusJ) {
@@ -198,8 +198,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 #else
                    if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
 #endif
-                        float invR = native_rsqrt(r2);
+                        float invR = RSQRT(r2);
-                        float r = native_recip(invR);
+                        float r = RECIP(invR);
                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        float rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
@@ -301,14 +301,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                    delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    float bornRadius2 = localData[tbx+j].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
                    float expTerm = exp(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
-                    float denominator = native_sqrt(denominator2);
+                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
                    float Gpol = tempEnergy/denominator2;
                    float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -370,14 +370,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                            delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                            float invR = native_rsqrt(r2);
+                            float invR = RSQRT(r2);
-                            float r = native_recip(invR);
+                            float r = RECIP(invR);
                            float bornRadius2 = localData[tbx+j].bornRadius;
                            float alpha2_ij = bornRadius1*bornRadius2;
                            float D_ij = r2/(4.0f*alpha2_ij);
                            float expTerm = exp(-D_ij);
                            float denominator2 = r2 + alpha2_ij*expTerm;
-                            float denominator = native_sqrt(denominator2);
+                            float denominator = SQRT(denominator2);
                            float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
                            float Gpol = tempEnergy/denominator2;
                            float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -436,14 +436,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
                        delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                        float invR = native_rsqrt(r2);
+                        float invR = RSQRT(r2);
-                        float r = native_recip(invR);
+                        float r = RECIP(invR);
                        float bornRadius2 = localData[tbx+tj].bornRadius;
                        float alpha2_ij = bornRadius1*bornRadius2;
                        float D_ij = r2/(4.0f*alpha2_ij);
                        float expTerm = exp(-D_ij);
                        float denominator2 = r2 + alpha2_ij*expTerm;
-                        float denominator = native_sqrt(denominator2);
+                        float denominator = SQRT(denominator2);
                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
                        float Gpol = tempEnergy/denominator2;
                        float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);

--- a/platforms/opencl/src/kernels/harmonicAngleForce.cl
+++ b/platforms/opencl/src/kernels/harmonicAngleForce.cl
@@ -20,11 +20,11 @@ __kernel void calcHarmonicAngleForce(int numAtoms, int numAngles, __global float
        float4 v1 = a2-a3;
        float4 cp = cross(v0, v1);
        float rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
-        rp = max(native_sqrt(rp), 1.0e-06f);
+        rp = max(SQRT(rp), 1.0e-06f);
        float r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
        float r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
        float dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
-        float cosine = dot*native_rsqrt(r21*r23);
+        float cosine = dot*RSQRT(r21*r23);
        float deltaIdeal = acos(cosine)-angleParams.x;
        energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
        float dEdR = angleParams.y*deltaIdeal;

--- a/platforms/opencl/src/kernels/harmonicBondForce.cl
+++ b/platforms/opencl/src/kernels/harmonicBondForce.cl
@@ -14,7 +14,7 @@ __kernel void calcHarmonicBondForce(int numAtoms, int numBonds, __global float4*
        // Compute the force.
-        float r = native_sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+        float r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
        float deltaIdeal = r-bondParams.x;
        energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
        float dEdR = bondParams.y * deltaIdeal;

--- a/platforms/opencl/src/kernels/nonbondedExceptions.cl
+++ b/platforms/opencl/src/kernels/nonbondedExceptions.cl
@@ -16,7 +16,7 @@ __kernel void computeNonbondedExceptions(__global float4* forceBuffers, __global
        // Compute the force.
        float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-        float invR = native_rsqrt(r2);
+        float invR = RSQRT(r2);
        float sig2 = invR*exceptionParams.y;
        sig2 *= sig2;
        float sig6 = sig2*sig2*sig2;

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -67,8 +67,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float invR = native_rsqrt(r2);
+                float invR = RSQRT(r2);
-                float r = native_recip(invR);
+                float r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+j;
                float dEdR = 0.0f;
@@ -135,8 +135,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float invR = native_rsqrt(r2);
+                float invR = RSQRT(r2);
-                float r = native_recip(invR);
+                float r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y+baseLocalAtom+tj;
                float dEdR = 0.0f;

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -124,8 +124,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                            delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                            float invR = native_rsqrt(r2);
+                            float invR = RSQRT(r2);
-                            float r = native_recip(invR);
+                            float r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y+j;
                            float dEdR = 0.0f;
@@ -181,8 +181,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    delta.z -= floor(delta.z*INV_PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
 #endif
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                    float invR = native_rsqrt(r2);
+                    float invR = RSQRT(r2);
-                    float r = native_recip(invR);
+                    float r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y+tj;
                    float dEdR = 0.0f;

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -30,3 +30,14 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
        index += get_global_size(0);
    }
 }
+/**
+ * This is called to determine the accuracy of native_sqrt(), native_rsqrt() and native_recip().
+ */
+__kernel void determineNativeAccuracy(__global float4* values, int numValues) {
+    for (int i = 0; i < numValues; ++i) {
+        float v = values[i].x;
+        values[i] = (float4) (v, native_sqrt(v), native_rsqrt(v), native_recip(v));
+    }
+}
\ No newline at end of file