Commit 5cd23acb authored by peastman's avatar peastman
Browse files

Improved vectorization of CpuNonbondedForce

parent e22808f3
...@@ -192,7 +192,8 @@ static inline fvec8 sqrt(fvec8 v) { ...@@ -192,7 +192,8 @@ static inline fvec8 sqrt(fvec8 v) {
} }
static inline float dot8(fvec8 v1, fvec8 v2) { static inline float dot8(fvec8 v1, fvec8 v2) {
return dot4(v1.lowerVec(), v2.lowerVec())+dot4(v1.upperVec(), v2.upperVec()); fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
} }
static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) { static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
...@@ -223,7 +224,6 @@ static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& ...@@ -223,7 +224,6 @@ static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4&
// Functions that operate on ivec8s. // Functions that operate on ivec8s.
static inline bool any(ivec8 v) { static inline bool any(ivec8 v) {
return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF)); return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
} }
......
...@@ -228,7 +228,7 @@ private: ...@@ -228,7 +228,7 @@ private:
* Compute the displacement and squared distance between a collection of points, optionally using * Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const; void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
......
...@@ -228,7 +228,7 @@ private: ...@@ -228,7 +228,7 @@ private:
* Compute the displacement and squared distance between a collection of points, optionally using * Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const; void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
......
...@@ -451,11 +451,10 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* ...@@ -451,11 +451,10 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
int blockAtom[4]; int blockAtom[4];
fvec4 blockAtomPosq[4]; fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4]; fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i]; blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
} }
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]); fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
...@@ -463,15 +462,8 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* ...@@ -463,15 +462,8 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first); fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second); fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = false; bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
if (periodic) { any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
for (int i = 0; i < 4 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -482,12 +474,11 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* ...@@ -482,12 +474,11 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
// Load the next neighbor. // Load the next neighbor.
int atom = neighbors[i]; int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2; fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include; ivec4 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -533,32 +524,37 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* ...@@ -533,32 +524,37 @@ void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double*
// Accumulate energies. // Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) { if (totalEnergy) {
if (cutoff) if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf); energy += chargeProd*(inverseR+krf*r2-crf);
else else
energy += chargeProd*inverseR; energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include); energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f); *totalEnergy += dot4(energy, one);
} }
// Accumulate forces. // Accumulate forces.
dEdR = blend(0.0f, dEdR, include); dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f}; fvec4 fx = dx*dEdR;
transpose(result[0], result[1], result[2], result[3]); fvec4 fy = dy*dEdR;
fvec4 atomForce(forces+4*atom); fvec4 fz = dz*dEdR;
for (int j = 0; j < 4; j++) { blockAtomForceX += fx;
blockAtomForce[j] += result[j]; blockAtomForceY += fy;
atomForce -= result[j]; blockAtomForceZ += fz;
} float* atomForce = forces+4*atom;
atomForce.store(forces+4*atom); atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
} }
// Record the forces on the block atoms. // Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
...@@ -566,11 +562,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do ...@@ -566,11 +562,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
int blockAtom[4]; int blockAtom[4];
fvec4 blockAtomPosq[4]; fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4]; fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i]; blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
} }
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]); fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
...@@ -578,13 +573,8 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do ...@@ -578,13 +573,8 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first); fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second); fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = false; bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
for (int i = 0; i < 4 && !needPeriodic; i++) any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -595,12 +585,11 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do ...@@ -595,12 +585,11 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
// Load the next neighbor. // Load the next neighbor.
int atom = neighbors[i]; int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2; fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include; ivec4 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -643,29 +632,34 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do ...@@ -643,29 +632,34 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
// Accumulate energies. // Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) { if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r); energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include); energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f); *totalEnergy += dot4(energy, one);
} }
// Accumulate forces. // Accumulate forces.
dEdR = blend(0.0f, dEdR, include); dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f}; fvec4 fx = dx*dEdR;
transpose(result[0], result[1], result[2], result[3]); fvec4 fy = dy*dEdR;
fvec4 atomForce(forces+4*atom); fvec4 fz = dz*dEdR;
for (int j = 0; j < 4; j++) { blockAtomForceX += fx;
blockAtomForce[j] += result[j]; blockAtomForceY += fy;
atomForce -= result[j]; blockAtomForceZ += fz;
} float* atomForce = forces+4*atom;
atomForce.store(forces+4*atom); atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
} }
// Record the forces on the block atoms. // Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
...@@ -677,7 +671,7 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d ...@@ -677,7 +671,7 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
r2 = dot3(deltaR, deltaR); r2 = dot3(deltaR, deltaR);
} }
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForce::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0]; dx = x-posI[0];
dy = y-posI[1]; dy = y-posI[1];
dz = z-posI[2]; dz = z-posI[2];
......
...@@ -451,27 +451,18 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou ...@@ -451,27 +451,18 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou
int blockAtom[8]; int blockAtom[8];
fvec4 blockAtomPosq[8]; fvec4 blockAtomPosq[8];
fvec4 blockAtomForce[8]; fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i]; blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
} }
fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]); transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]); blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first); fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second); fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = false; bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
if (periodic) { any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
for (int i = 0; i < 8 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -482,12 +473,11 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou ...@@ -482,12 +473,11 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou
// Load the next neighbor. // Load the next neighbor.
int atom = neighbors[i]; int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2; fvec8 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include; ivec8 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -533,33 +523,37 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou ...@@ -533,33 +523,37 @@ void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, dou
// Accumulate energies. // Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) { if (totalEnergy) {
if (cutoff) if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf); energy += chargeProd*(inverseR+krf*r2-crf);
else else
energy += chargeProd*inverseR; energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include); energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, 1.0f); *totalEnergy += dot8(energy, one);
} }
// Accumulate forces. // Accumulate forces.
dEdR = blend(0.0f, dEdR, include); dEdR = blend(0.0f, dEdR, include);
fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f}; fvec8 fx = dx*dEdR;
fvec4 rt[8]; fvec8 fy = dy*dEdR;
transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]); fvec8 fz = dz*dEdR;
fvec4 atomForce(forces+4*atom); blockAtomForceX += fx;
for (int j = 0; j < 8; j++) { blockAtomForceY += fy;
blockAtomForce[j] += rt[j]; blockAtomForceZ += fz;
atomForce -= rt[j]; float* atomForce = forces+4*atom;
} atomForce[0] -= dot8(fx, one);
atomForce.store(forces+4*atom); atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
} }
// Record the forces on the block atoms. // Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++) for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
...@@ -567,27 +561,18 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces ...@@ -567,27 +561,18 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces
int blockAtom[8]; int blockAtom[8];
fvec4 blockAtomPosq[8]; fvec4 blockAtomPosq[8];
fvec4 blockAtomForce[8]; fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i]; blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
} }
fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]); transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]); blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first); fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second); fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = false; bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
if (periodic) { any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
for (int i = 0; i < 8 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -598,12 +583,11 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces ...@@ -598,12 +583,11 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces
// Load the next neighbor. // Load the next neighbor.
int atom = neighbors[i]; int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2; fvec8 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include; ivec8 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -646,30 +630,34 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces ...@@ -646,30 +630,34 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces
// Accumulate energies. // Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) { if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r); energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include); energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, 1.0f); *totalEnergy += dot8(energy, one);
} }
// Accumulate forces. // Accumulate forces.
dEdR = blend(0.0f, dEdR, include); dEdR = blend(0.0f, dEdR, include);
fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f}; fvec8 fx = dx*dEdR;
fvec4 rt[8]; fvec8 fy = dy*dEdR;
transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]); fvec8 fz = dz*dEdR;
fvec4 atomForce(forces+4*atom); blockAtomForceX += fx;
for (int j = 0; j < 8; j++) { blockAtomForceY += fy;
blockAtomForce[j] += rt[j]; blockAtomForceZ += fz;
atomForce -= rt[j]; float* atomForce = forces+4*atom;
} atomForce[0] -= dot8(fx, one);
atomForce.store(forces+4*atom); atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
} }
// Record the forces on the block atoms. // Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++) for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
...@@ -681,7 +669,7 @@ void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec ...@@ -681,7 +669,7 @@ void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec
r2 = dot3(deltaR, deltaR); r2 = dot3(deltaR, deltaR);
} }
void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0]; dx = x-posI[0];
dy = y-posI[1]; dy = y-posI[1];
dz = z-posI[2]; dz = z-posI[2];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment