Use native_exp and native_log when they are sufficiently accurate

c87cee06 · Peter Eastman · 2a313e48 · c87cee06 · c87cee06 · c87cee06
Commit c87cee06 authored Jul 20, 2010 by Peter Eastman
9 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -107,10 +107,10 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
    cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
-    OpenCLArray<mm_float4> values(*this, 20, "values", true);
+    OpenCLArray<mm_float8> values(*this, 20, "values", true);
    float nextValue = 1e-4;
    for (int i = 0; i < values.getSize(); ++i) {
-        values[i].x = nextValue;
+        values[i].s0 = nextValue;
        nextValue *= M_PI;
    }
    values.upload();
@@ -118,12 +118,15 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
    accuracyKernel.setArg<cl_int>(1, values.getSize());
    executeKernel(accuracyKernel, values.getSize());
    values.download();
-    double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0;
+    double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
    for (int i = 0; i < values.getSize(); ++i) {
-        double correctSqrt = sqrt(values[i].x);
+        double v = values[i].s0;
-        maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].y)/correctSqrt);
+        double correctSqrt = sqrt(v);
-        maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].z)*correctSqrt);
+        maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
-        maxRecipError = max(maxRecipError, fabs(1.0/values[i].x-values[i].w)/values[i].w);
+        maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
+        maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
+        maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
+        maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
    }
    if (maxSqrtError < 1e-6)
        compilationOptions += " -DSQRT=native_sqrt";
@@ -137,6 +140,14 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
        compilationOptions += " -DRECIP=native_recip";
    else
        compilationOptions += " -DRECIP=1.0f/";
+    if (maxExpError < 1e-6)
+        compilationOptions += " -DEXP=native_exp";
+    else
+        compilationOptions += " -DEXP=exp";
+    if (maxLogError < 1e-6)
+        compilationOptions += " -DLOG=native_log";
+    else
+        compilationOptions += " -DLOG=log";
 }
 OpenCLContext::~OpenCLContext() {

--- a/platforms/opencl/src/OpenCLExpressionUtilities.cpp
+++ b/platforms/opencl/src/OpenCLExpressionUtilities.cpp
@@ -153,10 +153,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
            out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::EXP:
-            out << "exp(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::LOG:
-            out << "log(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SIN:
            out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -17,7 +17,7 @@ if (!isExcluded || needCorrection) {
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-        tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
+        tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
        tempEnergy += -prefactor*(1.0f-erfcAlphaR);
    }
    else if (r2 < CUTOFF_SQUARED) {
@@ -27,10 +27,10 @@ if (!isExcluded || needCorrection) {
        sig2 *= sig2;
        float sig6 = sig2*sig2*sig2;
        float eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
-        tempForce = eps*(12.0f*sig6 - 6.0f)*sig6 + prefactor*(erfcAlphaR+alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
+        tempForce = eps*(12.0f*sig6 - 6.0f)*sig6 + prefactor*(erfcAlphaR+alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
        tempEnergy += eps*(sig6 - 1.0f)*sig6 + prefactor*erfcAlphaR;
 #else
-        tempForce = prefactor*(erfcAlphaR+alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
+        tempForce = prefactor*(erfcAlphaR+alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
        tempEnergy += prefactor*erfcAlphaR;
 #endif
    }

--- a/platforms/opencl/src/kernels/ewald.cl
+++ b/platforms/opencl/src/kernels/ewald.cl
@@ -46,7 +46,7 @@ __kernel void calculateEwaldCosSinSums(__global float* energyBuffer, __global fl
        // Compute the contribution to the energy.
        float k2 = kx*kx + ky*ky + kz*kz;
-        float ak = exp(k2*EXP_COEFFICIENT) / k2;
+        float ak = EXP(k2*EXP_COEFFICIENT) / k2;
        energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
        index += get_global_size(0);
    }
@@ -83,7 +83,7 @@ __kernel void calculateEwaldForces(__global float4* forceBuffers, __global float
                    int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
                    float k2 = kx*kx + ky*ky + kz*kz;
-                    float ak = exp(k2*EXP_COEFFICIENT)/k2;
+                    float ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
                    float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
                    float2 sum = cosSinSum[index];

--- a/platforms/opencl/src/kernels/gbsaObc2.cl
+++ b/platforms/opencl/src/kernels/gbsaObc2.cl
@@ -14,8 +14,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
    float l_ij2I = l_ijI*l_ijI;
    float u_ij2J = u_ijJ*u_ijJ;
    float u_ij2I = u_ijI*u_ijI;
-    float t1J = log(u_ijJ/l_ijJ);
+    float t1J = LOG(u_ijJ/l_ijJ);
-    float t1I = log(u_ijI/l_ijI);
+    float t1I = LOG(u_ijI/l_ijI);
    float t2J = (l_ij2J-u_ij2J);
    float t2I = (l_ij2I-u_ij2I);
    float t3J = t2J*invR;

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                        float u_ij = RECIP(rScaledRadiusJ);
                        float l_ij2 = l_ij*l_ij;
                        float u_ij2 = u_ij*u_ij;
-                        float ratio = log(u_ij / l_ij);
+                        float ratio = LOG(u_ij / l_ij);
                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                        if (params1.x < params2.x-r)
@@ -143,7 +143,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                        float u_ij = RECIP(rScaledRadiusJ);
                        float l_ij2 = l_ij*l_ij;
                        float u_ij2 = u_ij*u_ij;
-                        float ratio = log(u_ij / l_ij);
+                        float ratio = LOG(u_ij / l_ij);
                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                        if (params1.x < params2.x-r)
@@ -155,7 +155,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                        float u_ij = RECIP(rScaledRadiusI);
                        float l_ij2 = l_ij*l_ij;
                        float u_ij2 = u_ij*u_ij;
-                        float ratio = log(u_ij / l_ij);
+                        float ratio = LOG(u_ij / l_ij);
                        float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                         (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
                        if (params2.x < params1.x-r)
@@ -248,7 +248,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                    float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
-                    float expTerm = exp(-D_ij);
+                    float expTerm = EXP(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
@@ -322,7 +322,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                    float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
-                    float expTerm = exp(-D_ij);
+                    float expTerm = EXP(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                        float u_ij = RECIP(rScaledRadiusJ);
                        float l_ij2 = l_ij*l_ij;
                        float u_ij2 = u_ij*u_ij;
-                        float ratio = log(u_ij / l_ij);
+                        float ratio = LOG(u_ij / l_ij);
                        bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                         (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                        if (params1.x < params2.x-r)
@@ -140,7 +140,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                                    float u_ij = RECIP(rScaledRadiusJ);
                                    float l_ij2 = l_ij*l_ij;
                                    float u_ij2 = u_ij*u_ij;
-                                    float ratio = log(u_ij / l_ij);
+                                    float ratio = LOG(u_ij / l_ij);
                                    bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                                     (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                                    if (params1.x < params2.x-r)
@@ -152,7 +152,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                                    float u_ij = RECIP(rScaledRadiusI);
                                    float l_ij2 = l_ij*l_ij;
                                    float u_ij2 = u_ij*u_ij;
-                                    float ratio = log(u_ij / l_ij);
+                                    float ratio = LOG(u_ij / l_ij);
                                    float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                                     (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
                                    if (params2.x < params1.x-r)
@@ -208,7 +208,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                            float u_ij = RECIP(rScaledRadiusJ);
                            float l_ij2 = l_ij*l_ij;
                            float u_ij2 = u_ij*u_ij;
-                            float ratio = log(u_ij / l_ij);
+                            float ratio = LOG(u_ij / l_ij);
                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                             (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                            if (params1.x < params2.x-r)
@@ -220,7 +220,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
                            float u_ij = RECIP(rScaledRadiusI);
                            float l_ij2 = l_ij*l_ij;
                            float u_ij2 = u_ij*u_ij;
-                            float ratio = log(u_ij / l_ij);
+                            float ratio = LOG(u_ij / l_ij);
                            float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                             (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
                            if (params2.x < params1.x-r)
@@ -308,7 +308,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                    float bornRadius2 = localData[tbx+j].bornRadius;
                    float alpha2_ij = bornRadius1*bornRadius2;
                    float D_ij = r2/(4.0f*alpha2_ij);
-                    float expTerm = exp(-D_ij);
+                    float expTerm = EXP(-D_ij);
                    float denominator2 = r2 + alpha2_ij*expTerm;
                    float denominator = SQRT(denominator2);
                    float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
@@ -377,7 +377,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                            float bornRadius2 = localData[tbx+j].bornRadius;
                            float alpha2_ij = bornRadius1*bornRadius2;
                            float D_ij = r2/(4.0f*alpha2_ij);
-                            float expTerm = exp(-D_ij);
+                            float expTerm = EXP(-D_ij);
                            float denominator2 = r2 + alpha2_ij*expTerm;
                            float denominator = SQRT(denominator2);
                            float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
@@ -443,7 +443,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
                        float bornRadius2 = localData[tbx+tj].bornRadius;
                        float alpha2_ij = bornRadius1*bornRadius2;
                        float D_ij = r2/(4.0f*alpha2_ij);
-                        float expTerm = exp(-D_ij);
+                        float expTerm = EXP(-D_ij);
                        float denominator2 = r2 + alpha2_ij*expTerm;
                        float denominator = SQRT(denominator2);
                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -173,7 +173,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
        float2 grid = pmeGrid[index];
        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        float denom = m2*bx*by*bz;
-        float eterm = recipScaleFactor*exp(-RECIP_EXP_FACTOR*m2)/denom;
+        float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -59,12 +59,12 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
 }
 /**
- * This is called to determine the accuracy of native_sqrt(), native_rsqrt() and native_recip().
+ * This is called to determine the accuracy of various native functions.
 */
-__kernel void determineNativeAccuracy(__global float4* values, int numValues) {
+__kernel void determineNativeAccuracy(__global float8* values, int numValues) {
    for (int i = 0; i < numValues; ++i) {
-        float v = values[i].x;
+        float v = values[i].s0;
-        values[i] = (float4) (v, native_sqrt(v), native_rsqrt(v), native_recip(v));
+        values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
    }
 }
\ No newline at end of file