Commit 56902b45 authored by peastman's avatar peastman
Browse files

Restructured nonbonded calculation to allow more efficient vectorization

parent 6eacad45
......@@ -88,6 +88,27 @@ public:
fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val);
}
fvec4 operator&(__m128i other) const {
return _mm_and_si128(val, other);
}
fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other);
}
fvec4 operator!=(fvec4 other) const {
return _mm_cmpneq_ps(val, other);
}
fvec4 operator>(fvec4 other) const {
return _mm_cmpgt_ps(val, other);
}
fvec4 operator<(fvec4 other) const {
return _mm_cmplt_ps(val, other);
}
fvec4 operator>=(fvec4 other) const {
return _mm_cmpge_ps(val, other);
}
fvec4 operator<=(fvec4 other) const {
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
};
......@@ -174,6 +195,11 @@ static inline fvec4 max(fvec4 v1, fvec4 v2) {
return fvec4(_mm_max_ps(v1.val, v2.val));
}
static inline fvec4 abs(fvec4 v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask));
}
static inline fvec4 sqrt(fvec4 v) {
return fvec4(_mm_sqrt_ps(v.val));
}
......@@ -182,6 +208,10 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
static inline float dot4(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
// Functions that operate on ivec4s.
static inline ivec4 min(ivec4 v1, ivec4 v2) {
......@@ -192,5 +222,27 @@ static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val));
}
static inline ivec4 abs(ivec4 v) {
return ivec4(_mm_abs_epi32(v.val));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2;
}
#endif /*OPENMM_VECTORIZE_H_*/
......@@ -13,19 +13,25 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
public:
class ThreadData;
class VoxelHash;
static const int BlockSize;
CpuNeighborList();
~CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance);
const std::vector<std::pair<int, int> >& getNeighbors();
int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const;
const std::vector<int>& getBlockNeighbors(int blockIndex) const;
const std::vector<char>& getBlockExclusions(int blockIndex) const;
/**
* This routine contains the code executed by each thread.
*/
void runThread(int index, std::vector<std::pair<int, int> >& threadNeighbors);
void runThread(int index);
private:
bool isDeleted;
int numThreads, waitCount;
std::vector<std::pair<int, int> > neighbors;
std::vector<int> sortedAtoms;
std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions;
std::vector<pthread_t> thread;
std::vector<ThreadData*> threadData;
pthread_cond_t startCondition, endCondition;
......
......@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/vectorize.h"
#include <pthread.h>
......@@ -33,6 +34,8 @@
#include <vector>
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForce {
public:
class ThreadData;
......@@ -63,7 +66,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void setUseCutoff(float distance, const std::vector<std::pair<int, int> >& neighbors, float solventDielectric);
void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
/**---------------------------------------------------------------------------------------
......@@ -127,9 +130,9 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::RealVec>& atomCoordinates,
void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
std::vector<OpenMM::RealVec>& forces, float* totalEnergy) const;
std::vector<RealVec>& forces, float* totalEnergy) const;
/**---------------------------------------------------------------------------------------
......@@ -159,14 +162,14 @@ private:
bool periodic;
bool ewald;
bool pme;
const std::vector<std::pair<int, int> >* neighborList;
const CpuNeighborList* neighborList;
float periodicBoxSize[3];
float cutoffDistance, switchingDistance;
float krf, crf;
float alphaEwald;
int numRx, numRy, numRz;
int meshDim[3];
std::vector<float> ewaldScaleX, ewaldScaleY, ewaldScaleDeriv;
std::vector<float> ewaldScaleTable;
float ewaldDX, ewaldDXInv;
bool isDeleted;
int numThreads, waitCount;
......@@ -199,16 +202,27 @@ private:
/**---------------------------------------------------------------------------------------
Calculate LJ Coulomb pair ixn between two atoms
Calculate all the interactions for one atom block.
@param atom1 the index of the first atom
@param atom2 the index of the second atom
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateOneEwaldIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Compute the displacement and squared distance between two points, optionally using
......@@ -219,7 +233,7 @@ private:
/**
* Compute a fast approximation to erfc(x).
*/
static float erfcApprox(float x);
static fvec4 erfcApprox(fvec4 x);
/**
* Create a lookup table for the scale factor used with Ewald and PME.
......@@ -229,9 +243,11 @@ private:
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
float ewaldScaleFunction(float x);
fvec4 ewaldScaleFunction(fvec4 x);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__
......@@ -234,7 +234,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
lastPositions = posData;
}
nonbonded.setUseCutoff(nonbondedCutoff, neighborList.getNeighbors(), rfDielectric);
nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
}
if (periodic || ewald || pme) {
double minAllowedSize = 1.999999*nonbondedCutoff;
......
#include "CpuNeighborList.h"
#include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include "hilbert.h"
#include <algorithm>
#include <set>
#include <map>
......@@ -11,6 +12,8 @@ using namespace std;
namespace OpenMM {
const int CpuNeighborList::BlockSize = 4;
class VoxelIndex
{
public:
......@@ -77,23 +80,22 @@ public:
return VoxelIndex(x, y, z);
}
void getNeighbors(vector<pair<int, int> >& neighbors, const VoxelItem& referencePoint, const vector<set<int> >& exclusions, float maxDistance) const {
// Loop over neighboring voxels
// TODO use more clever selection of neighboring voxels
const int atomI = referencePoint.second;
const float* locationI = referencePoint.first;
fvec4 posI(locationI);
void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
neighbors.resize(0);
exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize(1/periodicBoxSize[0], 1/periodicBoxSize[1], 1/periodicBoxSize[2], 0);
float maxDistanceSquared = maxDistance * maxDistance;
float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
float refineCutoffSquared = refineCutoff*refineCutoff;
int dIndexX = int(maxDistance / voxelSizeX) + 1; // How may voxels away do we have to look?
int dIndexY = int(maxDistance / voxelSizeY) + 1;
int dIndexZ = int(maxDistance / voxelSizeZ) + 1;
VoxelIndex centerVoxelIndex = getVoxelIndex(locationI);
int dIndexX = int((maxDistance+blockWidth[0])/voxelSizeX) + 1; // How may voxels away do we have to look?
int dIndexY = int((maxDistance+blockWidth[1])/voxelSizeY) + 1;
int dIndexZ = int((maxDistance+blockWidth[2])/voxelSizeZ) + 1;
float centerPos[4];
blockCenter.store(centerPos);
VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
int lastx = centerVoxelIndex.x+dIndexX;
int lasty = centerVoxelIndex.y+dIndexY;
int lastz = centerVoxelIndex.z+dIndexZ;
......@@ -102,6 +104,7 @@ public:
lasty = min(lasty, centerVoxelIndex.y-dIndexY+ny-1);
lastz = min(lastz, centerVoxelIndex.z-dIndexZ+nz-1);
}
int lastSortedIndex = BlockSize*(blockIndex+1);
VoxelIndex voxelIndex(0, 0, 0);
for (int x = centerVoxelIndex.x - dIndexX; x <= lastx; ++x) {
voxelIndex.x = x;
......@@ -120,27 +123,52 @@ public:
continue; // no such voxel; skip
const Voxel& voxel = voxelEntry->second;
for (Voxel::const_iterator itemIter = voxel.begin(); itemIter != voxel.end(); ++itemIter) {
const int atomJ = itemIter->second;
const int sortedIndex = itemIter->second;
// Avoid duplicate entries.
if (atomJ >= atomI)
if (sortedIndex >= lastSortedIndex)
break;
fvec4 posJ(itemIter->first);
fvec4 delta = posJ-posI;
fvec4 atomPos(itemIter->first);
fvec4 delta = atomPos-centerPos;
if (usePeriodic) {
fvec4 base = round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
delta = max(0.0f, abs(delta)-blockWidth);
float dSquared = dot3(delta, delta);
if (dSquared > maxDistanceSquared)
continue;
// Ignore exclusions.
if (exclusions[atomI].find(atomJ) != exclusions[atomI].end())
continue;
neighbors.push_back(make_pair(atomI, atomJ));
if (dSquared > refineCutoffSquared) {
// The distance is large enough that there might not be any actual interactions.
// Check individual atom pairs to be sure.
bool any = false;
for (int k = 0; k < (int) blockAtoms.size(); k++) {
fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
delta = atomPos-pos1;
if (usePeriodic) {
fvec4 base = round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
float r2 = dot3(delta, delta);
if (r2 < maxDistanceSquared) {
any = true;
break;
}
}
if (!any)
continue;
}
// Add this atom to the list of neighbors.
neighbors.push_back(sortedAtoms[sortedIndex]);
if (sortedIndex < BlockSize*blockIndex)
exclusions.push_back(0);
else
exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
}
}
}
......@@ -161,12 +189,11 @@ public:
}
int index;
CpuNeighborList& owner;
vector<pair<int, int> > threadNeighbors;
};
static void* threadBody(void* args) {
CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
data.owner.runThread(data.index, data.threadNeighbors);
data.owner.runThread(data.index);
delete &data;
return 0;
}
......@@ -204,6 +231,45 @@ CpuNeighborList::~CpuNeighborList() {
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
blockNeighbors.resize(numBlocks);
blockExclusions.resize(numBlocks);
sortedAtoms.resize(numAtoms);
// Sort the atoms based on a Hilbert curve.
float minx = atomLocations[0], maxx = atomLocations[0];
float miny = atomLocations[1], maxy = atomLocations[1];
float minz = atomLocations[2], maxz = atomLocations[2];
for (int i = 0; i < numAtoms; i++) {
const float* pos = &atomLocations[4*i];
if (pos[0] < minx)
minx = pos[0];
if (pos[1] < miny)
miny = pos[1];
if (pos[2] < minz)
minz = pos[2];
if (pos[0] > maxx)
maxx = pos[0];
if (pos[1] > maxy)
maxy = pos[1];
if (pos[2] > maxz)
maxz = pos[2];
}
float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
float invBinWidth = 1.0f/binWidth;
vector<pair<int, int> > atomBins(numAtoms);
bitmask_t coords[3];
for (int i = 0; i < numAtoms; i++) {
const float* pos = &atomLocations[4*i];
coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
int bin = (int) hilbert_c2i(3, 8, coords);
atomBins[i] = pair<int, int>(bin, i);
}
sort(atomBins.begin(), atomBins.end());
// Build the voxel hash.
float edgeSizeX, edgeSizeY, edgeSizeZ;
......@@ -215,8 +281,11 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
edgeSizeZ = 0.5f*periodicBoxSize[2]/floorf(periodicBoxSize[2]/maxDistance);
}
VoxelHash voxelHash(edgeSizeX, edgeSizeY, edgeSizeZ, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++)
voxelHash.insert(i, &atomLocations[4*i]);
for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex;
voxelHash.insert(i, &atomLocations[4*atomIndex]);
}
// Record the parameters for the threads.
......@@ -237,18 +306,37 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
pthread_cond_wait(&endCondition, &lock);
pthread_mutex_unlock(&lock);
// Combine the results from all the threads.
// Add padding atoms to fill up the last block.
neighbors.clear();
for (int i = 0; i < numThreads; i++)
neighbors.insert(neighbors.end(), threadData[i]->threadNeighbors.begin(), threadData[i]->threadNeighbors.end());
int numPadding = numBlocks*BlockSize-numAtoms;
if (numPadding > 0) {
char mask = (0xF0 >> numPadding) & 0xF;
for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1];
for (int i = 0; i < (int) exc.size(); i++)
exc[i] |= mask;
}
}
const vector<pair<int, int> >& CpuNeighborList::getNeighbors() {
return neighbors;
int CpuNeighborList::getNumBlocks() const {
return sortedAtoms.size()/BlockSize;
}
void CpuNeighborList::runThread(int index, vector<pair<int, int> >& threadNeighbors) {
const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
return sortedAtoms;
}
const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const {
return blockNeighbors[blockIndex];
}
const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
return blockExclusions[blockIndex];
}
void CpuNeighborList::runThread(int index) {
while (true) {
// Wait for the signal to start running.
......@@ -262,9 +350,41 @@ void CpuNeighborList::runThread(int index, vector<pair<int, int> >& threadNeighb
// Compute this thread's subset of neighbors.
threadNeighbors.clear();
for (int i = index; i < numAtoms; i += numThreads)
voxelHash->getNeighbors(threadNeighbors, VoxelItem(&atomLocations[4*i], i), *exclusions, maxDistance);
int numBlocks = blockNeighbors.size();
vector<int> blockAtoms;
for (int i = index; i < numBlocks; i += numThreads) {
{
int firstIndex = BlockSize*i;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++)
blockAtoms[j] = sortedAtoms[firstIndex+j];
}
int firstIndex = BlockSize*i;
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
for (int j = 1; j < atomsInBlock; j++) {
fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
voxelHash->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
// Record the exclusions for this block.
for (int j = 0; j < atomsInBlock; j++) {
const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
char mask = 1<<j;
for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
int atomIndex = blockNeighbors[i][k];
if (atomExclusions.find(atomIndex) != atomExclusions.end())
blockExclusions[i][k] |= mask;
}
}
}
}
}
......
......@@ -114,7 +114,7 @@ CpuNonbondedForce::~CpuNonbondedForce(){
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseCutoff(float distance, const vector<pair<int, int> >& neighbors, float solventDielectric) {
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
cutoff = true;
cutoffDistance = distance;
......@@ -200,23 +200,22 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void CpuNonbondedForce::tabulateEwaldScaleFactor() {
ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
ewaldDXInv = 1.0f/ewaldDX;
vector<double> x(NUM_TABLE_POINTS);
vector<double> y(NUM_TABLE_POINTS);
vector<double> x(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
double alphaR = alphaEwald*r;
x[i] = r;
y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
SplineFitter::createNaturalSpline(x, y, deriv);
ewaldScaleX.resize(NUM_TABLE_POINTS);
ewaldScaleY.resize(NUM_TABLE_POINTS);
ewaldScaleDeriv.resize(NUM_TABLE_POINTS);
ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
ewaldScaleX[i] = (float) x[i];
ewaldScaleY[i] = (float) y[i];
ewaldScaleDeriv[i] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i] = (float) y[i];
ewaldScaleTable[4*i+1] = (float) y[i+1];
ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
}
}
......@@ -384,7 +383,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR);
float erfcAlphaR = erfcApprox(alphaR)[0];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
......@@ -424,18 +423,14 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
if (ewald || pme) {
// Compute the interactions from the neighbor list.
for (int i = index; i < (int) neighborList->size(); i += numThreads) {
pair<int, int> pair = (*neighborList)[i];
calculateOneEwaldIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
else if (cutoff) {
// Compute the interactions from the neighbor list.
for (int i = index; i < (int) neighborList->size(); i += numThreads) {
pair<int, int> pair = (*neighborList)[i];
calculateOneIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
else {
// Loop over all atom pairs
......@@ -503,49 +498,196 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
(fvec4(forces+4*jj)-result).store(forces+4*jj);
}
void CpuNonbondedForce::calculateOneEwaldIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
fvec4 deltaR;
fvec4 posI(posq+4*ii);
fvec4 posJ(posq+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, true, boxSize, invBoxSize);
if (r2 < cutoffDistance*cutoffDistance) {
float r = sqrtf(r2);
float inverseR = 1/r;
float switchValue = 1, switchDeriv = 0;
if (useSwitch && r > switchingDistance) {
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
switchValue = 1+t*t*t*(-10+t*(15-t*6));
switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
any |= include[j];
}
float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
float dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
float sig = atomParameters[ii].first + atomParameters[jj].first;
float sig2 = inverseR*sig;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float eps = atomParameters[ii].second*atomParameters[jj].second;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
}
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
float energy = eps*(sig6-1.0f)*sig6;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// accumulate forces
// Accumulate energies.
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*ii)+result).store(forces+4*ii);
(fvec4(forces+4*jj)-result).store(forces+4*jj);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
}
// accumulate energies
// Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
any |= include[j];
}
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// Accumulate energies.
if (totalEnergy) {
energy += (float) (chargeProd*inverseR*erfcApprox(alphaEwald*r));
*totalEnergy += energy;
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
}
// Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
......@@ -557,24 +699,33 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
r2 = dot3(deltaR, deltaR);
}
float CpuNonbondedForce::erfcApprox(float x) {
fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
float CpuNonbondedForce::ewaldScaleFunction(float x) {
fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
int lower = (int) (x*ewaldDXInv);
int upper = lower+1;
float a = (ewaldScaleX[upper]-x)*ewaldDXInv;
float b = 1.0f-a;
return a*ewaldScaleY[lower]+b*ewaldScaleY[upper]+((a*a*a-a)*ewaldScaleDeriv[lower] + (b*b*b-b)*ewaldScaleDeriv[upper]);
float y[4];
fvec4 x1 = x*ewaldDXInv;
ivec4 index = floor(x1);
fvec4 coeff[4];
coeff[1] = x1-index;
coeff[0] = 1.0f-coeff[1];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
_MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
for (int i = 0; i < 4; i++) {
if (index[i] < NUM_TABLE_POINTS)
y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
}
return fvec4(y);
}
......@@ -39,6 +39,7 @@
#include "sfmt/SFMT.h"
#include <iostream>
#include <set>
#include <utility>
#include <vector>
using namespace OpenMM;
......@@ -68,10 +69,19 @@ void testNeighborList(bool periodic) {
// Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getNeighbors().size(); i++) {
pair<int, int> entry = neighborList.getNeighbors()[i];
ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
neighbors.insert(entry);
for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
int blockIndex = i/CpuNeighborList::BlockSize;
int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
int atom1 = neighborList.getSortedAtoms()[i];
int atom2 = neighborList.getBlockNeighbors(blockIndex)[j];
pair<int, int> entry = make_pair(min(atom1, atom2), max(atom1, atom2));
ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
neighbors.insert(entry);
}
}
}
// Check each particle pair and figure out whether they should be in the neighbor list.
......@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
if (dx*dx + dy*dy + dz*dz > cutoff*cutoff)
shouldInclude = false;
bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end());
ASSERT_EQUAL(shouldInclude, isIncluded);
if (shouldInclude)
ASSERT(isIncluded);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment