Commit 6b10b909 authored by peastman's avatar peastman
Browse files

Continuing to optimize CPU nonbonded routines

parent 9bc194b9
......@@ -29,6 +29,7 @@
#include <set>
#include <utility>
#include <vector>
#include <smmintrin.h>
// ---------------------------------------------------------------------------------------
class CpuNonbondedForce {
......@@ -47,6 +48,7 @@ class CpuNonbondedForce {
float alphaEwald;
int numRx, numRy, numRz;
int meshDim[3];
__m128 boxSize, invBoxSize, half;
// parameter indices
......@@ -69,7 +71,7 @@ class CpuNonbondedForce {
void calculateOneIxn( int atom1, int atom2, float* atomCoordinates,
float** atomParameters, float* forces,
float* totalEnergy ) const;
double* totalEnergy ) const;
public:
......@@ -122,7 +124,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void setPeriodic( float* boxSize );
void setPeriodic( float* periodicBoxSize );
/**---------------------------------------------------------------------------------------
......@@ -194,8 +196,7 @@ private:
float* fixedParameters, float* forces,
float* totalEnergy, bool includeDirect, bool includeReciprocal) const;
void getDeltaR(const float* atomCoordinatesI, const float* atomCoordinatesJ,
const float* boxSize, float* deltaR, bool periodic) const;
void getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic) const;
};
......
......@@ -113,16 +113,19 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setPeriodic(float* boxSize) {
void CpuNonbondedForce::setPeriodic(float* periodicBoxSize) {
assert(cutoff);
assert(boxSize[0] >= 2*cutoffDistance);
assert(boxSize[1] >= 2*cutoffDistance);
assert(boxSize[2] >= 2*cutoffDistance);
assert(periodicBoxSize[0] >= 2*cutoffDistance);
assert(periodicBoxSize[1] >= 2*cutoffDistance);
assert(periodicBoxSize[2] >= 2*cutoffDistance);
periodic = true;
periodicBoxSize[0] = boxSize[0];
periodicBoxSize[1] = boxSize[1];
periodicBoxSize[2] = boxSize[2];
this->periodicBoxSize[0] = periodicBoxSize[0];
this->periodicBoxSize[1] = periodicBoxSize[1];
this->periodicBoxSize[2] = periodicBoxSize[2];
boxSize = _mm_set_ps(0, periodicBoxSize[2], periodicBoxSize[1], periodicBoxSize[0]);
invBoxSize = _mm_set_ps(0, (1/periodicBoxSize[2]), (1/periodicBoxSize[1]), (1/periodicBoxSize[0]));
half = _mm_set1_ps(0.5);
}
/**---------------------------------------------------------------------------------------
......@@ -198,7 +201,6 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
float totalSelfEwaldEnergy = 0.0;
float realSpaceEwaldEnergy = 0.0;
float recipEnergy = 0.0;
float totalRecipEnergy = 0.0;
float vdwEnergy = 0.0;
// **************************************************************************************
......@@ -207,7 +209,7 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
if (includeReciprocal) {
for (int atomID = 0; atomID < numberOfAtoms; atomID++){
float selfEwaldEnergy = (float) (ONE_4PI_EPS0*atomParameters[atomID][QIndex]*atomParameters[atomID][QIndex] * alphaEwald/SQRT_PI);
float selfEwaldEnergy = (float) (ONE_4PI_EPS0*atomCoordinates[4*atomID+3]*atomCoordinates[4*atomID+3] * alphaEwald/SQRT_PI);
totalSelfEwaldEnergy -= selfEwaldEnergy;
}
}
......@@ -326,11 +328,8 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
f[2] += 2 * recipCoeff * force * kz;
}
recipEnergy = recipCoeff * ak * (cs * cs + ss * ss);
totalRecipEnergy += recipEnergy;
if (totalEnergy)
*totalEnergy += recipEnergy;
*totalEnergy += recipCoeff * ak * (cs * cs + ss * ss);
lowrz = 1 - numRz;
}
......@@ -345,18 +344,21 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
if (!includeDirect)
return;
float totalVdwEnergy = 0.0f;
float totalRealSpaceEwaldEnergy = 0.0f;
double totalVdwEnergy = 0.0f;
double totalRealSpaceEwaldEnergy = 0.0f;
for (int i = 0; i < (int) neighborList->size(); i++) {
pair<int, int> pair = (*neighborList)[i];
int ii = pair.first;
int jj = pair.second;
float deltaR[2][ReferenceForce::LastDeltaRIndex];
getDeltaR(atomCoordinates+4*jj, atomCoordinates+4*ii, periodicBoxSize, deltaR[0], true);
float r = deltaR[0][ReferenceForce::RIndex];
float inverseR = one/(deltaR[0][ReferenceForce::RIndex]);
__m128 deltaR;
__m128 posI = _mm_loadu_ps(atomCoordinates+4*ii);
__m128 posJ = _mm_loadu_ps(atomCoordinates+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, true);
float r = sqrtf(r2);
float inverseR = 1/r;
float switchValue = 1, switchDeriv = 0;
if (useSwitch && r > switchingDistance) {
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
......@@ -366,8 +368,9 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
float alphaR = alphaEwald * r;
float dEdR = (float) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (erfc(alphaR) + 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
float chargeProd = ONE_4PI_EPS0*atomCoordinates[4*ii+3]*atomCoordinates[4*jj+3];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (erfc(alphaR) + 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
float sig = atomParameters[ii][SigIndex] + atomParameters[jj][SigIndex];
float sig2 = inverseR*sig;
......@@ -383,15 +386,13 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
// accumulate forces
for (int kk = 0; kk < 3; kk++){
float force = dEdR*deltaR[0][kk];
forces[4*ii+kk] += force;
forces[4*jj+kk] -= force;
}
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR));
_mm_storeu_ps(forces+4*ii, _mm_add_ps(_mm_loadu_ps(forces+4*ii), result));
_mm_storeu_ps(forces+4*jj, _mm_sub_ps(_mm_loadu_ps(forces+4*jj), result));
// accumulate energies
realSpaceEwaldEnergy = (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR));
realSpaceEwaldEnergy = (float) (chargeProd*inverseR*erfc(alphaR));
totalVdwEnergy += vdwEnergy;
totalRealSpaceEwaldEnergy += realSpaceEwaldEnergy;
......@@ -410,26 +411,28 @@ void CpuNonbondedForce::calculateEwaldIxn(int numberOfAtoms, float* atomCoordina
int ii = i;
int jj = *iter;
float deltaR[2][ReferenceForce::LastDeltaRIndex];
getDeltaR(atomCoordinates+4*jj, atomCoordinates+4*ii, periodicBoxSize, deltaR[0], false);
float r = deltaR[0][ReferenceForce::RIndex];
float inverseR = one/(deltaR[0][ReferenceForce::RIndex]);
__m128 deltaR;
__m128 posI = _mm_loadu_ps(atomCoordinates+4*ii);
__m128 posJ = _mm_loadu_ps(atomCoordinates+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false);
float r = sqrtf(r2);
float inverseR = 1/r;
float alphaR = alphaEwald * r;
if (erf(alphaR) > 1e-6) {
float dEdR = (float) (ONE_4PI_EPS0 * atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (erf(alphaR) - 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
float chargeProd = ONE_4PI_EPS0*atomCoordinates[4*ii+3]*atomCoordinates[4*jj+3];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (erf(alphaR) - 2 * alphaR * exp (- alphaR * alphaR) / SQRT_PI));
// accumulate forces
for (int kk = 0; kk < 3; kk++){
float force = dEdR*deltaR[0][kk];
forces[4*ii+kk] -= force;
forces[4*jj+kk] += force;
}
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR));
_mm_storeu_ps(forces+4*ii, _mm_add_ps(_mm_loadu_ps(forces+4*ii), result));
_mm_storeu_ps(forces+4*jj, _mm_sub_ps(_mm_loadu_ps(forces+4*jj), result));
// accumulate energies
realSpaceEwaldEnergy = (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erf(alphaR));
realSpaceEwaldEnergy = (float) (chargeProd*inverseR*erf(alphaR));
totalExclusionEnergy += realSpaceEwaldEnergy;
}
......@@ -470,10 +473,12 @@ void CpuNonbondedForce::calculatePairIxn(int numberOfAtoms, float* atomCoordinat
}
if (!includeDirect)
return;
double directEnergy = 0;
double* energyPtr = (totalEnergy == NULL ? NULL : &directEnergy);
if (cutoff) {
for (int i = 0; i < (int) neighborList->size(); i++) {
pair<int, int> pair = (*neighborList)[i];
calculateOneIxn(pair.first, pair.second, atomCoordinates, atomParameters, forces, totalEnergy);
calculateOneIxn(pair.first, pair.second, atomCoordinates, atomParameters, forces, energyPtr);
}
}
else {
......@@ -482,9 +487,11 @@ void CpuNonbondedForce::calculatePairIxn(int numberOfAtoms, float* atomCoordinat
for (int jj = ii+1; jj < numberOfAtoms; jj++)
if (exclusions[jj].find(ii) == exclusions[jj].end())
calculateOneIxn(ii, jj, atomCoordinates, atomParameters, forces, totalEnergy);
calculateOneIxn(ii, jj, atomCoordinates, atomParameters, forces, energyPtr);
}
}
if (totalEnergy != NULL)
*totalEnergy += (float) directEnergy;
}
/**---------------------------------------------------------------------------------------
......@@ -502,90 +509,63 @@ void CpuNonbondedForce::calculatePairIxn(int numberOfAtoms, float* atomCoordinat
void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* atomCoordinates,
float** atomParameters, float* forces,
float* totalEnergy) const {
// ---------------------------------------------------------------------------------------
static const std::string methodName = "\nCpuNonbondedForce::calculateOneIxn";
// ---------------------------------------------------------------------------------------
// constants -- reduce Visual Studio warnings regarding conversions between float & double
static const float zero = 0.0;
static const float one = 1.0;
static const float two = 2.0;
static const float three = 3.0;
static const float six = 6.0;
static const float twelve = 12.0;
static const float oneM = -1.0;
static const int threeI = 3;
static const int LastAtomIndex = 2;
float deltaR[2][ReferenceForce::LastDeltaRIndex];
double* totalEnergy) const {
// get deltaR, R2, and R between 2 atoms
getDeltaR(atomCoordinates+4*jj, atomCoordinates+4*ii, periodicBoxSize, deltaR[0], periodic);
float r2 = deltaR[0][ReferenceForce::R2Index];
float inverseR = one/(deltaR[0][ReferenceForce::RIndex]);
__m128 deltaR;
__m128 posI = _mm_loadu_ps(atomCoordinates+4*ii);
__m128 posJ = _mm_loadu_ps(atomCoordinates+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, periodic);
float r = sqrtf(r2);
float inverseR = 1/r;
float switchValue = 1, switchDeriv = 0;
if (useSwitch) {
float r = deltaR[0][ReferenceForce::RIndex];
if (r > switchingDistance) {
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
switchValue = 1+t*t*t*(-10+t*(15-t*6));
switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
}
if (useSwitch && r > switchingDistance) {
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
switchValue = 1+t*t*t*(-10+t*(15-t*6));
switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
}
float sig = atomParameters[ii][SigIndex] + atomParameters[jj][SigIndex];
float sig2 = inverseR*sig;
sig2 *= sig2;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float eps = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
float dEdR = switchValue*eps*(twelve*sig6 - six)*sig6;
float dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
float chargeProd = ONE_4PI_EPS0*atomCoordinates[4*ii+3]*atomCoordinates[4*jj+3];
if (cutoff)
dEdR += (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*(inverseR-2.0f*krf*r2));
dEdR += (float) (chargeProd*(inverseR-2.0f*krf*r2));
else
dEdR += (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR);
dEdR += (float) (chargeProd*inverseR);
dEdR *= inverseR*inverseR;
float energy = eps*(sig6-one)*sig6;
float energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
if (cutoff)
energy += (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*(inverseR+krf*r2-crf));
else
energy += (float) (ONE_4PI_EPS0*atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR);
// accumulate forces
// accumulate energies
for (int kk = 0; kk < 3; kk++){
float force = dEdR*deltaR[0][kk];
forces[4*ii+kk] += force;
forces[4*jj+kk] -= force;
if (totalEnergy) {
if (cutoff)
energy += (float) (chargeProd*(inverseR+krf*r2-crf));
else
energy += (float) (chargeProd*inverseR);
*totalEnergy += energy;
}
// accumulate energies
// accumulate forces
if (totalEnergy)
*totalEnergy += energy;
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR));
_mm_storeu_ps(forces+4*ii, _mm_add_ps(_mm_loadu_ps(forces+4*ii), result));
_mm_storeu_ps(forces+4*jj, _mm_sub_ps(_mm_loadu_ps(forces+4*jj), result));
}
void CpuNonbondedForce::getDeltaR(const float* atomCoordinatesI, const float* atomCoordinatesJ, const float* boxSize, float* deltaR, bool periodic) const {
deltaR[ReferenceForce::XIndex] = atomCoordinatesJ[0] - atomCoordinatesI[0];
deltaR[ReferenceForce::YIndex] = atomCoordinatesJ[1] - atomCoordinatesI[1];
deltaR[ReferenceForce::ZIndex] = atomCoordinatesJ[2] - atomCoordinatesI[2];
void CpuNonbondedForce::getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic) const {
deltaR = _mm_sub_ps(posJ, posI);
if (periodic) {
deltaR[ReferenceForce::XIndex] -= (float) (floor(deltaR[ReferenceForce::XIndex]/boxSize[0]+0.5)*boxSize[0]);
deltaR[ReferenceForce::YIndex] -= (float) (floor(deltaR[ReferenceForce::YIndex]/boxSize[1]+0.5)*boxSize[1]);
deltaR[ReferenceForce::ZIndex] -= (float) (floor(deltaR[ReferenceForce::ZIndex]/boxSize[2]+0.5)*boxSize[2]);
__m128 base = _mm_mul_ps(_mm_floor_ps(_mm_add_ps(_mm_mul_ps(deltaR, invBoxSize), half)), boxSize);
deltaR = _mm_sub_ps(deltaR, base);
}
deltaR[ReferenceForce::R2Index] = DOT3(deltaR, deltaR);
deltaR[ReferenceForce::RIndex] = (float) SQRT(deltaR[ReferenceForce::R2Index]);
r2 = _mm_cvtss_f32(_mm_dp_ps(deltaR, deltaR, 0x71));
}
......@@ -93,28 +93,28 @@ void testEwaldPME(bool includeExceptions) {
}
}
// (1) Check whether the Reference and CUDA platforms agree when using Ewald Method
// (1) Check whether the Reference and CPU platforms agree when using Ewald Method
VerletIntegrator integrator1(0.01);
VerletIntegrator integrator2(0.01);
Context cuContext(system, integrator1, platform);
Context cpuContext(system, integrator1, platform);
Context referenceContext(system, integrator2, reference);
cuContext.setPositions(positions);
cpuContext.setPositions(positions);
referenceContext.setPositions(positions);
State cuState = cuContext.getState(State::Forces | State::Energy);
State cpuState = cpuContext.getState(State::Forces | State::Energy);
State referenceState = referenceContext.getState(State::Forces | State::Energy);
tol = 1e-2;
for (int i = 0; i < numParticles; i++) {
ASSERT_EQUAL_VEC(referenceState.getForces()[i], cuState.getForces()[i], tol);
ASSERT_EQUAL_VEC(referenceState.getForces()[i], cpuState.getForces()[i], tol);
}
tol = 1e-5;
ASSERT_EQUAL_TOL(referenceState.getPotentialEnergy(), cuState.getPotentialEnergy(), tol);
ASSERT_EQUAL_TOL(referenceState.getPotentialEnergy(), cpuState.getPotentialEnergy(), tol);
// (2) Check whether Ewald method in CUDA is self-consistent
// (2) Check whether Ewald method in CPU is self-consistent
double norm = 0.0;
for (int i = 0; i < numParticles; ++i) {
Vec3 f = cuState.getForces()[i];
Vec3 f = cpuState.getForces()[i];
norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
}
......@@ -123,38 +123,38 @@ void testEwaldPME(bool includeExceptions) {
double step = delta/norm;
for (int i = 0; i < numParticles; ++i) {
Vec3 p = positions[i];
Vec3 f = cuState.getForces()[i];
Vec3 f = cpuState.getForces()[i];
positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
}
VerletIntegrator integrator3(0.01);
Context cuContext2(system, integrator3, platform);
cuContext2.setPositions(positions);
Context cpuContext2(system, integrator3, platform);
cpuContext2.setPositions(positions);
tol = 1e-2;
State cuState2 = cuContext2.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (cuState2.getPotentialEnergy()-cuState.getPotentialEnergy())/delta, tol)
State cpuState2 = cpuContext2.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (cpuState2.getPotentialEnergy()-cpuState.getPotentialEnergy())/delta, tol)
// (3) Check whether the Reference and CUDA platforms agree when using PME
// (3) Check whether the Reference and CPU platforms agree when using PME
nonbonded->setNonbondedMethod(NonbondedForce::PME);
cuContext.reinitialize();
cpuContext.reinitialize();
referenceContext.reinitialize();
cuContext.setPositions(positions);
cpuContext.setPositions(positions);
referenceContext.setPositions(positions);
cuState = cuContext.getState(State::Forces | State::Energy);
cpuState = cpuContext.getState(State::Forces | State::Energy);
referenceState = referenceContext.getState(State::Forces | State::Energy);
tol = 1e-2;
for (int i = 0; i < numParticles; i++) {
ASSERT_EQUAL_VEC(referenceState.getForces()[i], cuState.getForces()[i], tol);
ASSERT_EQUAL_VEC(referenceState.getForces()[i], cpuState.getForces()[i], tol);
}
tol = 1e-5;
ASSERT_EQUAL_TOL(referenceState.getPotentialEnergy(), cuState.getPotentialEnergy(), tol);
ASSERT_EQUAL_TOL(referenceState.getPotentialEnergy(), cpuState.getPotentialEnergy(), tol);
// (4) Check whether PME method in CUDA is self-consistent
// (4) Check whether PME method in CPU is self-consistent
norm = 0.0;
for (int i = 0; i < numParticles; ++i) {
Vec3 f = cuState.getForces()[i];
Vec3 f = cpuState.getForces()[i];
norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
}
......@@ -162,16 +162,16 @@ void testEwaldPME(bool includeExceptions) {
step = delta/norm;
for (int i = 0; i < numParticles; ++i) {
Vec3 p = positions[i];
Vec3 f = cuState.getForces()[i];
Vec3 f = cpuState.getForces()[i];
positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
}
VerletIntegrator integrator4(0.01);
Context cuContext3(system, integrator4, platform);
cuContext3.setPositions(positions);
Context cpuContext3(system, integrator4, platform);
cpuContext3.setPositions(positions);
tol = 1e-2;
State cuState3 = cuContext3.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (cuState3.getPotentialEnergy()-cuState.getPotentialEnergy())/delta, tol)
State cpuState3 = cpuContext3.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (cpuState3.getPotentialEnergy()-cpuState.getPotentialEnergy())/delta, tol)
}
void testEwald2Ions() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment