Commit c87cee06 authored by Peter Eastman's avatar Peter Eastman
Browse files

Use native_exp and native_log when they are sufficiently accurate

parent 2a313e48
...@@ -107,10 +107,10 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste ...@@ -107,10 +107,10 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use. // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy"); cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
OpenCLArray<mm_float4> values(*this, 20, "values", true); OpenCLArray<mm_float8> values(*this, 20, "values", true);
float nextValue = 1e-4; float nextValue = 1e-4;
for (int i = 0; i < values.getSize(); ++i) { for (int i = 0; i < values.getSize(); ++i) {
values[i].x = nextValue; values[i].s0 = nextValue;
nextValue *= M_PI; nextValue *= M_PI;
} }
values.upload(); values.upload();
...@@ -118,12 +118,15 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste ...@@ -118,12 +118,15 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
accuracyKernel.setArg<cl_int>(1, values.getSize()); accuracyKernel.setArg<cl_int>(1, values.getSize());
executeKernel(accuracyKernel, values.getSize()); executeKernel(accuracyKernel, values.getSize());
values.download(); values.download();
double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0; double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
for (int i = 0; i < values.getSize(); ++i) { for (int i = 0; i < values.getSize(); ++i) {
double correctSqrt = sqrt(values[i].x); double v = values[i].s0;
maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].y)/correctSqrt); double correctSqrt = sqrt(v);
maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].z)*correctSqrt); maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/values[i].x-values[i].w)/values[i].w); maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
} }
if (maxSqrtError < 1e-6) if (maxSqrtError < 1e-6)
compilationOptions += " -DSQRT=native_sqrt"; compilationOptions += " -DSQRT=native_sqrt";
...@@ -137,6 +140,14 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste ...@@ -137,6 +140,14 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
compilationOptions += " -DRECIP=native_recip"; compilationOptions += " -DRECIP=native_recip";
else else
compilationOptions += " -DRECIP=1.0f/"; compilationOptions += " -DRECIP=1.0f/";
if (maxExpError < 1e-6)
compilationOptions += " -DEXP=native_exp";
else
compilationOptions += " -DEXP=exp";
if (maxLogError < 1e-6)
compilationOptions += " -DLOG=native_log";
else
compilationOptions += " -DLOG=log";
} }
OpenCLContext::~OpenCLContext() { OpenCLContext::~OpenCLContext() {
......
...@@ -153,10 +153,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -153,10 +153,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")"; out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::EXP: case Operation::EXP:
out << "exp(" << getTempName(node.getChildren()[0], temps) << ")"; out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::LOG: case Operation::LOG:
out << "log(" << getTempName(node.getChildren()[0], temps) << ")"; out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::SIN: case Operation::SIN:
out << "sin(" << getTempName(node.getChildren()[0], temps) << ")"; out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
......
...@@ -17,7 +17,7 @@ if (!isExcluded || needCorrection) { ...@@ -17,7 +17,7 @@ if (!isExcluded || needCorrection) {
if (needCorrection) { if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution. // Subtract off the part of this interaction that was included in the reciprocal space contribution.
tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI); tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*(1.0f-erfcAlphaR); tempEnergy += -prefactor*(1.0f-erfcAlphaR);
} }
else if (r2 < CUTOFF_SQUARED) { else if (r2 < CUTOFF_SQUARED) {
...@@ -27,10 +27,10 @@ if (!isExcluded || needCorrection) { ...@@ -27,10 +27,10 @@ if (!isExcluded || needCorrection) {
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; float sig6 = sig2*sig2*sig2;
float eps = sigmaEpsilon1.y*sigmaEpsilon2.y; float eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
tempForce = eps*(12.0f*sig6 - 6.0f)*sig6 + prefactor*(erfcAlphaR+alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI); tempForce = eps*(12.0f*sig6 - 6.0f)*sig6 + prefactor*(erfcAlphaR+alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
tempEnergy += eps*(sig6 - 1.0f)*sig6 + prefactor*erfcAlphaR; tempEnergy += eps*(sig6 - 1.0f)*sig6 + prefactor*erfcAlphaR;
#else #else
tempForce = prefactor*(erfcAlphaR+alphaR*exp(-alphaR*alphaR)*TWO_OVER_SQRT_PI); tempForce = prefactor*(erfcAlphaR+alphaR*EXP(-alphaR*alphaR)*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR; tempEnergy += prefactor*erfcAlphaR;
#endif #endif
} }
......
...@@ -46,7 +46,7 @@ __kernel void calculateEwaldCosSinSums(__global float* energyBuffer, __global fl ...@@ -46,7 +46,7 @@ __kernel void calculateEwaldCosSinSums(__global float* energyBuffer, __global fl
// Compute the contribution to the energy. // Compute the contribution to the energy.
float k2 = kx*kx + ky*ky + kz*kz; float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*EXP_COEFFICIENT) / k2; float ak = EXP(k2*EXP_COEFFICIENT) / k2;
energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y); energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -83,7 +83,7 @@ __kernel void calculateEwaldForces(__global float4* forceBuffers, __global float ...@@ -83,7 +83,7 @@ __kernel void calculateEwaldForces(__global float4* forceBuffers, __global float
int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1); int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
float k2 = kx*kx + ky*ky + kz*kz; float k2 = kx*kx + ky*ky + kz*kz;
float ak = exp(k2*EXP_COEFFICIENT)/k2; float ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz; phase = apos.z*kz;
float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase))); float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
float2 sum = cosSinSum[index]; float2 sum = cosSinSum[index];
......
...@@ -14,8 +14,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { ...@@ -14,8 +14,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
float l_ij2I = l_ijI*l_ijI; float l_ij2I = l_ijI*l_ijI;
float u_ij2J = u_ijJ*u_ijJ; float u_ij2J = u_ijJ*u_ijJ;
float u_ij2I = u_ijI*u_ijI; float u_ij2I = u_ijI*u_ijI;
float t1J = log(u_ijJ/l_ijJ); float t1J = LOG(u_ijJ/l_ijJ);
float t1I = log(u_ijI/l_ijI); float t1I = LOG(u_ijI/l_ijI);
float t2J = (l_ij2J-u_ij2J); float t2J = (l_ij2J-u_ij2J);
float t2I = (l_ij2I-u_ij2I); float t2I = (l_ij2I-u_ij2I);
float t3J = t2J*invR; float t3J = t2J*invR;
......
...@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
...@@ -143,7 +143,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -143,7 +143,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
...@@ -155,7 +155,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -155,7 +155,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusI); float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
...@@ -248,7 +248,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -248,7 +248,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
float bornRadius2 = localData[baseLocalAtom+j].bornRadius; float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
...@@ -322,7 +322,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -322,7 +322,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
float bornRadius2 = localData[baseLocalAtom+tj].bornRadius; float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
......
...@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -75,7 +75,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
...@@ -140,7 +140,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -140,7 +140,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
...@@ -152,7 +152,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -152,7 +152,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusI); float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
...@@ -208,7 +208,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -208,7 +208,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
...@@ -220,7 +220,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -220,7 +220,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float u_ij = RECIP(rScaledRadiusI); float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij); float ratio = LOG(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
...@@ -308,7 +308,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -308,7 +308,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
float bornRadius2 = localData[tbx+j].bornRadius; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
...@@ -377,7 +377,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -377,7 +377,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
float bornRadius2 = localData[tbx+j].bornRadius; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
...@@ -443,7 +443,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -443,7 +443,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
float bornRadius2 = localData[tbx+tj].bornRadius; float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
......
...@@ -173,7 +173,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en ...@@ -173,7 +173,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
float2 grid = pmeGrid[index]; float2 grid = pmeGrid[index];
float m2 = mhx*mhx+mhy*mhy+mhz*mhz; float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
float denom = m2*bx*by*bz; float denom = m2*bx*by*bz;
float eterm = recipScaleFactor*exp(-RECIP_EXP_FACTOR*m2)/denom; float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm); pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
......
...@@ -59,12 +59,12 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu ...@@ -59,12 +59,12 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
} }
/** /**
* This is called to determine the accuracy of native_sqrt(), native_rsqrt() and native_recip(). * This is called to determine the accuracy of various native functions.
*/ */
__kernel void determineNativeAccuracy(__global float4* values, int numValues) { __kernel void determineNativeAccuracy(__global float8* values, int numValues) {
for (int i = 0; i < numValues; ++i) { for (int i = 0; i < numValues; ++i) {
float v = values[i].x; float v = values[i].s0;
values[i] = (float4) (v, native_sqrt(v), native_rsqrt(v), native_recip(v)); values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment