Commit 3a415486 authored by peastman's avatar peastman
Browse files

Optimizations to CpuNonbondedForce

parent 79b85ad7
......@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
return fvec8(_mm256_sqrt_ps(v.val));
}
static inline fvec8 rsqrt(const fvec8& v) {
// Initial estimate of rsqrt().
fvec8 y(_mm256_rsqrt_ps(v.val));
// Perform an iteration of Newton refinement.
fvec8 x2 = v*0.5f;
y *= fvec8(1.5f)-x2*y*y;
return y;
}
static inline float dot8(const fvec8& v1, const fvec8& v2) {
fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
......
......@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
return vabsq_f32(v);
}
static inline fvec4 sqrt(const fvec4& v) {
static inline fvec4 rsqrt(const fvec4& v) {
float32x4_t recipSqrt = vrsqrteq_f32(v);
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
return vmulq_f32(v, recipSqrt);
return recipSqrt;
}
static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v;
}
static inline float dot3(const fvec4& v1, const fvec4& v2) {
......
......@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
return truncated + blend(0.0f, 1.0f, truncated<v);
}
static inline fvec4 sqrt(const fvec4& v) {
static inline fvec4 rsqrt(const fvec4& v) {
// Initial estimate of rsqrt().
ivec4 i = (__m128i) v;
......@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
y *= 1.5f-x2*y*y;
y *= 1.5f-x2*y*y;
y *= 1.5f-x2*y*y;
return y*v;
return y;
}
static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v;
}
#endif /*OPENMM_VECTORIZE_PNACL_H_*/
......
......@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
return fvec4(_mm_sqrt_ps(v.val));
}
static inline fvec4 rsqrt(const fvec4& v) {
// Initial estimate of rsqrt().
fvec4 y(_mm_rsqrt_ps(v.val));
// Perform an iteration of Newton refinement.
fvec4 x2 = v*0.5f;
y *= fvec4(1.5f)-x2*y*y;
return y;
}
static inline float dot3(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
......
......@@ -95,8 +95,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 inverseR = rsqrt(r2);
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
......@@ -108,6 +107,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 r = r2*inverseR;
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
......@@ -212,8 +212,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 inverseR = rsqrt(r2);
fvec4 r = r2*inverseR;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
......
......@@ -126,8 +126,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 inverseR = rsqrt(r2);
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
......@@ -139,6 +138,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 r = r2*inverseR;
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
......@@ -242,8 +242,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 inverseR = rsqrt(r2);
fvec8 r = r2*inverseR;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment