Very minor optimizations

a7b68da3 · Peter Eastman · c56b4801
Commit a7b68da3 authored May 04, 2011 by Peter Eastman
11 changed files
--- a/platforms/opencl/src/kernels/cmapTorsionForce.cl
+++ b/platforms/opencl/src/kernels/cmapTorsionForce.cl
@@ -31,7 +31,7 @@ __kernel void computeCMAPTorsionForces(int numAtoms, int numTorsions, __global f

            float4 cross_prod = cross(cp0a, cp1a);
            float scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
-            angleA = asin(sqrt(dot(cross_prod, cross_prod)/scale));
+            angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
            if (cosangle < 0.0f)
                angleA = PI-angleA;
        }
@@ -54,7 +54,7 @@ __kernel void computeCMAPTorsionForces(int numAtoms, int numTorsions, __global f

            float4 cross_prod = cross(cp0b, cp1b);
            float scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
-            angleB = asin(sqrt(dot(cross_prod, cross_prod)/scale));
+            angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
            if (cosangle < 0.0f)
                angleB = PI-angleB;
        }
@@ -104,7 +104,7 @@ __kernel void computeCMAPTorsionForces(int numAtoms, int numTorsions, __global f

        float normCross1 = dot(cp0a, cp0a);
        float normSqrBC = dot(v1a, v1a);
-        float normBC = sqrt(normSqrBC);
+        float normBC = SQRT(normSqrBC);
        float normCross2 = dot(cp1a, cp1a);
        float dp = 1.0f/normSqrBC;
        float4 ff = (float4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
@@ -129,7 +129,7 @@ __kernel void computeCMAPTorsionForces(int numAtoms, int numTorsions, __global f

        normCross1 = dot(cp0b, cp0b);
        normSqrBC = dot(v1b, v1b);
-        normBC = sqrt(normSqrBC);
+        normBC = SQRT(normSqrBC);
        normCross2 = dot(cp1b, cp1b);
        dp = 1.0f/normSqrBC;
        ff = (float4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -38,7 +38,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_default.cl
@@ -42,7 +42,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
@@ -61,7 +61,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -35,7 +35,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/customGBValueN2_default.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_default.cl
@@ -39,7 +39,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
@@ -59,7 +59,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -43,7 +43,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -46,7 +46,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -66,7 +66,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
        else
 #endif
        {
-            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
@@ -120,8 +120,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                float r = sqrt(r2);
-                float invR = RECIP(r);
+                float invR = RSQRT(r2);
+                float r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -63,7 +63,7 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
 */

 __kernel void determineNativeAccuracy(__global float8* values, int numValues) {
-    for (int i = 0; i < numValues; ++i) {
+    for (int i = get_global_id(0); i < numValues; i += get_global_size(0)) {
        float v = values[i].s0;
        values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
    }