Commit e22808f3 authored by peastman's avatar peastman
Browse files

Created AVX implementation of CPU nonbonded force

parent ddad7c6c
...@@ -132,6 +132,12 @@ public: ...@@ -132,6 +132,12 @@ public:
operator __m256i() const { operator __m256i() const {
return val; return val;
} }
ivec4 lowerVec() const {
return _mm256_castsi256_si128(val);
}
ivec4 upperVec() const {
return _mm256_extractf128_si256(val, 1);
}
void store(int* v) const { void store(int* v) const {
_mm256_storeu_si256((__m256i*) v, val); _mm256_storeu_si256((__m256i*) v, val);
} }
......
...@@ -45,8 +45,7 @@ class OPENMM_EXPORT_CPU CpuNeighborList { ...@@ -45,8 +45,7 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
public: public:
class ThreadTask; class ThreadTask;
class Voxels; class Voxels;
static const int BlockSize; CpuNeighborList(int blockSize);
CpuNeighborList();
void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions, void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads); const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
int getNumBlocks() const; int getNumBlocks() const;
...@@ -59,6 +58,7 @@ public: ...@@ -59,6 +58,7 @@ public:
void threadComputeNeighborList(ThreadPool& threads, int threadIndex); void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
void runThread(int index); void runThread(int index);
private: private:
int blockSize;
std::vector<int> sortedAtoms; std::vector<int> sortedAtoms;
std::vector<std::vector<int> > blockNeighbors; std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions; std::vector<std::vector<char> > blockExclusions;
......
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize8.h"
#include <set>
#include <utility>
#include <vector>
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForceVec8 {
public:
class ComputeDirectTask;
/**---------------------------------------------------------------------------------------
Constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec8();
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
--------------------------------------------------------------------------------------- */
void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
/**---------------------------------------------------------------------------------------
Set the force to use a switching function on the Lennard-Jones interaction.
@param distance the switching distance
--------------------------------------------------------------------------------------- */
void setUseSwitchingFunction(float distance);
/**---------------------------------------------------------------------------------------
Set the force to use periodic boundary conditions. This requires that a cutoff has
already been set, and the smallest side of the periodic box is at least twice the cutoff
distance.
@param boxSize the X, Y, and Z widths of the periodic box
--------------------------------------------------------------------------------------- */
void setPeriodic(float* periodicBoxSize);
/**---------------------------------------------------------------------------------------
Set the force to use Ewald summation.
@param alpha the Ewald separation parameter
@param kmaxx the largest wave vector in the x direction
@param kmaxy the largest wave vector in the y direction
@param kmaxz the largest wave vector in the z direction
--------------------------------------------------------------------------------------- */
void setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz);
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation.
@param alpha the Ewald separation parameter
@param gridSize the dimensions of the mesh
--------------------------------------------------------------------------------------- */
void setUsePME(float alpha, int meshSize[3]);
/**---------------------------------------------------------------------------------------
Calculate Ewald ixn
@param numberOfAtoms number of atoms
@param posq atom coordinates and charges
@param atomCoordinates atom coordinates (in format needed by PME)
@param atomParameters atom parameters (sigma/2, 2*sqrt(epsilon))
@param exclusions atom exclusion indices
exclusions[atomIndex] contains the list of exclusions for that atom
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
std::vector<RealVec>& forces, double* totalEnergy) const;
/**---------------------------------------------------------------------------------------
Calculate LJ Coulomb pair ixn
@param numberOfAtoms number of atoms
@param posq atom coordinates and charges
@param atomCoordinates atom coordinates (periodic boundary conditions not applied)
@param atomParameters atom parameters (sigma/2, 2*sqrt(epsilon))
@param exclusions atom exclusion indices
exclusions[atomIndex] contains the list of exclusions for that atom
@param forces force array (forces added)
@param totalEnergy total energy
@param threads the thread pool to use
--------------------------------------------------------------------------------------- */
void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
*/
void threadComputeDirect(ThreadPool& threads, int threadIndex);
private:
bool cutoff;
bool useSwitch;
bool periodic;
bool ewald;
bool pme;
bool tableIsValid;
const CpuNeighborList* neighborList;
float periodicBoxSize[3];
float cutoffDistance, switchingDistance;
float krf, crf;
float alphaEwald;
int numRx, numRy, numRz;
int meshDim[3];
std::vector<float> ewaldScaleTable;
float ewaldDX, ewaldDXInv;
std::vector<double> threadEnergy;
// The following variables are used to make information accessible to the individual threads.
int numberOfAtoms;
float* posq;
RealVec const* atomCoordinates;
std::pair<float, float> const* atomParameters;
std::set<int> const* exclusions;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
void* atomicCounter;
static const float TWO_OVER_SQRT_PI;
static const int NUM_TABLE_POINTS;
/**---------------------------------------------------------------------------------------
Calculate LJ Coulomb pair ixn between two atoms
@param atom1 the index of the first atom
@param atom2 the index of the second atom
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Compute the displacement and squared distance between two points, optionally using
* periodic boundary conditions.
*/
void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
static fvec8 erfcApprox(fvec8 x);
/**
* Create a lookup table for the scale factor used with Ewald and PME.
*/
void tabulateEwaldScaleFactor();
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec8 ewaldScaleFunction(fvec8 x);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__
...@@ -43,8 +43,6 @@ using namespace std; ...@@ -43,8 +43,6 @@ using namespace std;
namespace OpenMM { namespace OpenMM {
const int CpuNeighborList::BlockSize = 4;
class VoxelIndex class VoxelIndex
{ {
public: public:
...@@ -62,8 +60,8 @@ public: ...@@ -62,8 +60,8 @@ public:
*/ */
class CpuNeighborList::Voxels { class CpuNeighborList::Voxels {
public: public:
Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) : Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) { blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
if (usePeriodic) { if (usePeriodic) {
nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f); nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f); ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
...@@ -156,7 +154,7 @@ public: ...@@ -156,7 +154,7 @@ public:
return VoxelIndex(x, y); return VoxelIndex(x, y);
} }
void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const { void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
neighbors.resize(0); neighbors.resize(0);
exclusions.resize(0); exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0); fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
...@@ -175,9 +173,6 @@ public: ...@@ -175,9 +173,6 @@ public:
float centerPos[4]; float centerPos[4];
blockCenter.store(centerPos); blockCenter.store(centerPos);
VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos); VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
VoxelIndex atomVoxelIndex[BlockSize];
for (int i = 0; i < (int) blockAtoms.size(); i++)
atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
int startx = centerVoxelIndex.x-dIndexX; int startx = centerVoxelIndex.x-dIndexX;
int starty = centerVoxelIndex.y-dIndexY; int starty = centerVoxelIndex.y-dIndexY;
int endx = centerVoxelIndex.x+dIndexX; int endx = centerVoxelIndex.x+dIndexX;
...@@ -193,7 +188,7 @@ public: ...@@ -193,7 +188,7 @@ public:
endx = min(endx, nx-1); endx = min(endx, nx-1);
endy = min(endy, ny-1); endy = min(endy, ny-1);
} }
int lastSortedIndex = BlockSize*(blockIndex+1); int lastSortedIndex = blockSize*(blockIndex+1);
VoxelIndex voxelIndex(0, 0); VoxelIndex voxelIndex(0, 0);
for (int x = startx; x <= endx; ++x) { for (int x = startx; x <= endx; ++x) {
voxelIndex.x = x; voxelIndex.x = x;
...@@ -300,10 +295,12 @@ public: ...@@ -300,10 +295,12 @@ public:
// Add this atom to the list of neighbors. // Add this atom to the list of neighbors.
neighbors.push_back(sortedAtoms[sortedIndex]); neighbors.push_back(sortedAtoms[sortedIndex]);
if (sortedIndex < BlockSize*blockIndex) if (sortedIndex < blockSize*blockIndex)
exclusions.push_back(0); exclusions.push_back(0);
else else {
exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex))); int mask = (1<<blockSize)-1;
exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
}
} }
} }
} }
...@@ -311,6 +308,7 @@ public: ...@@ -311,6 +308,7 @@ public:
} }
private: private:
int blockSize;
float voxelSizeX, voxelSizeY; float voxelSizeX, voxelSizeY;
float minx, maxx, miny, maxy; float minx, maxx, miny, maxy;
int nx, ny; int nx, ny;
...@@ -329,12 +327,12 @@ public: ...@@ -329,12 +327,12 @@ public:
CpuNeighborList& owner; CpuNeighborList& owner;
}; };
CpuNeighborList::CpuNeighborList() { CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
} }
void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions, void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) { const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize; int numBlocks = (numAtoms+blockSize-1)/blockSize;
blockNeighbors.resize(numBlocks); blockNeighbors.resize(numBlocks);
blockExclusions.resize(numBlocks); blockExclusions.resize(numBlocks);
sortedAtoms.resize(numAtoms); sortedAtoms.resize(numAtoms);
...@@ -381,7 +379,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -381,7 +379,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance); edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance); edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
} }
Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic); Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second; int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex; sortedAtoms[i] = atomIndex;
...@@ -397,9 +395,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -397,9 +395,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
// Add padding atoms to fill up the last block. // Add padding atoms to fill up the last block.
int numPadding = numBlocks*BlockSize-numAtoms; int numPadding = numBlocks*blockSize-numAtoms;
if (numPadding > 0) { if (numPadding > 0) {
char mask = (0xF0 >> numPadding) & 0xF; char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
for (int i = 0; i < numPadding; i++) for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0); sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1]; vector<char>& exc = blockExclusions[blockExclusions.size()-1];
...@@ -409,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -409,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
} }
int CpuNeighborList::getNumBlocks() const { int CpuNeighborList::getNumBlocks() const {
return sortedAtoms.size()/BlockSize; return sortedAtoms.size()/blockSize;
} }
const std::vector<int>& CpuNeighborList::getSortedAtoms() const { const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
...@@ -446,14 +444,18 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -446,14 +444,18 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
int numBlocks = blockNeighbors.size(); int numBlocks = blockNeighbors.size();
vector<int> blockAtoms; vector<int> blockAtoms;
vector<VoxelIndex> atomVoxelIndex;
for (int i = threadIndex; i < numBlocks; i += numThreads) { for (int i = threadIndex; i < numBlocks; i += numThreads) {
// Find the atoms in this block and compute their bounding box. // Find the atoms in this block and compute their bounding box.
int firstIndex = BlockSize*i; int firstIndex = blockSize*i;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex); int atomsInBlock = min(blockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock); blockAtoms.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++) atomVoxelIndex.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++) {
blockAtoms[j] = sortedAtoms[firstIndex+j]; blockAtoms[j] = sortedAtoms[firstIndex+j];
atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
}
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]); fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos; fvec4 maxPos = minPos;
for (int j = 1; j < atomsInBlock; j++) { for (int j = 1; j < atomsInBlock; j++) {
...@@ -461,7 +463,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -461,7 +463,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
minPos = min(minPos, pos); minPos = min(minPos, pos);
maxPos = max(maxPos, pos); maxPos = max(maxPos, pos);
} }
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations); voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
// Record the exclusions for this block. // Record the exclusions for this block.
......
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <complex>
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec8.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
#include "openmm/internal/MSVC_erfc.h"
using namespace std;
using namespace OpenMM;
const float CpuNonbondedForceVec8::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
const int CpuNonbondedForceVec8::NUM_TABLE_POINTS = 2048;
class CpuNonbondedForceVec8::ComputeDirectTask : public ThreadPool::Task {
public:
ComputeDirectTask(CpuNonbondedForceVec8& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.threadComputeDirect(threads, threadIndex);
}
CpuNonbondedForceVec8& owner;
};
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec8 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec8::CpuNonbondedForceVec8() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
}
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
@param distance the cutoff distance
@param neighbors the neighbor list to use
@param solventDielectric the dielectric constant of the bulk solvent
--------------------------------------------------------------------------------------- */
void CpuNonbondedForceVec8::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
if (distance != cutoffDistance)
tableIsValid = false;
cutoff = true;
cutoffDistance = distance;
neighborList = &neighbors;
krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
}
/**---------------------------------------------------------------------------------------
Set the force to use a switching function on the Lennard-Jones interaction.
@param distance the switching distance
--------------------------------------------------------------------------------------- */
void CpuNonbondedForceVec8::setUseSwitchingFunction(float distance) {
useSwitch = true;
switchingDistance = distance;
}
/**---------------------------------------------------------------------------------------
Set the force to use periodic boundary conditions. This requires that a cutoff has
also been set, and the smallest side of the periodic box is at least twice the cutoff
distance.
@param boxSize the X, Y, and Z widths of the periodic box
--------------------------------------------------------------------------------------- */
void CpuNonbondedForceVec8::setPeriodic(float* periodicBoxSize) {
assert(cutoff);
assert(periodicBoxSize[0] >= 2*cutoffDistance);
assert(periodicBoxSize[1] >= 2*cutoffDistance);
assert(periodicBoxSize[2] >= 2*cutoffDistance);
periodic = true;
this->periodicBoxSize[0] = periodicBoxSize[0];
this->periodicBoxSize[1] = periodicBoxSize[1];
this->periodicBoxSize[2] = periodicBoxSize[2];
}
/**---------------------------------------------------------------------------------------
Set the force to use Ewald summation.
@param alpha the Ewald separation parameter
@param kmaxx the largest wave vector in the x direction
@param kmaxy the largest wave vector in the y direction
@param kmaxz the largest wave vector in the z direction
--------------------------------------------------------------------------------------- */
void CpuNonbondedForceVec8::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
numRx = kmaxx;
numRy = kmaxy;
numRz = kmaxz;
ewald = true;
tabulateEwaldScaleFactor();
}
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation.
@param alpha the Ewald separation parameter
@param gridSize the dimensions of the mesh
--------------------------------------------------------------------------------------- */
void CpuNonbondedForceVec8::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1];
meshDim[2] = meshSize[2];
pme = true;
tabulateEwaldScaleFactor();
}
void CpuNonbondedForceVec8::tabulateEwaldScaleFactor() {
if (tableIsValid)
return;
tableIsValid = true;
ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
ewaldDXInv = 1.0f/ewaldDX;
ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double r = i*ewaldDX;
double alphaR = alphaEwald*r;
ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
}
void CpuNonbondedForceVec8::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
vector<RealVec>& forces, double* totalEnergy) const {
typedef std::complex<float> d_complex;
static const float epsilon = 1.0;
int kmax = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
float factorEwald = -1 / (4*alphaEwald*alphaEwald);
float TWO_PI = 2.0 * PI_M;
float recipCoeff = (float)(ONE_4PI_EPS0*4*PI_M/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
if (pme) {
pme_t pmedata;
RealOpenMM virial[3][3];
pme_init(&pmedata, alphaEwald, numberOfAtoms, meshDim, 5, 1);
vector<RealOpenMM> charges(numberOfAtoms);
for (int i = 0; i < numberOfAtoms; i++)
charges[i] = posq[4*i+3];
RealOpenMM boxSize[3] = {periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2]};
RealOpenMM recipEnergy = 0.0;
pme_exec(pmedata, atomCoordinates, forces, charges, boxSize, &recipEnergy, virial);
if (totalEnergy)
*totalEnergy += recipEnergy;
pme_destroy(pmedata);
}
// Ewald method
else if (ewald) {
// setup reciprocal box
float recipBoxSize[3] = { TWO_PI / periodicBoxSize[0], TWO_PI / periodicBoxSize[1], TWO_PI / periodicBoxSize[2]};
// setup K-vectors
#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
vector<d_complex> eir(kmax*numberOfAtoms*3);
vector<d_complex> tab_xy(numberOfAtoms);
vector<d_complex> tab_qxyz(numberOfAtoms);
for (int i = 0; (i < numberOfAtoms); i++) {
float* pos = posq+4*i;
for (int m = 0; (m < 3); m++)
EIR(0, i, m) = d_complex(1,0);
for (int m=0; (m<3); m++)
EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
sin(pos[m]*recipBoxSize[m]));
for (int j=2; (j<kmax); j++)
for (int m=0; (m<3); m++)
EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
}
// calculate reciprocal space energy and forces
int lowry = 0;
int lowrz = 1;
for (int rx = 0; rx < numRx; rx++) {
float kx = rx * recipBoxSize[0];
for (int ry = lowry; ry < numRy; ry++) {
float ky = ry * recipBoxSize[1];
if (ry >= 0) {
for (int n = 0; n < numberOfAtoms; n++)
tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
}
else {
for (int n = 0; n < numberOfAtoms; n++)
tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
}
for (int rz = lowrz; rz < numRz; rz++) {
if (rz >= 0) {
for (int n = 0; n < numberOfAtoms; n++)
tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * EIR(rz, n, 2));
}
else {
for (int n = 0; n < numberOfAtoms; n++)
tab_qxyz[n] = posq[4*n+3] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
}
float cs = 0.0f;
float ss = 0.0f;
for (int n = 0; n < numberOfAtoms; n++) {
cs += tab_qxyz[n].real();
ss += tab_qxyz[n].imag();
}
float kz = rz * recipBoxSize[2];
float k2 = kx * kx + ky * ky + kz * kz;
float ak = exp(k2*factorEwald) / k2;
for (int n = 0; n < numberOfAtoms; n++) {
float force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
forces[n][0] += 2 * recipCoeff * force * kx;
forces[n][1] += 2 * recipCoeff * force * ky;
forces[n][2] += 2 * recipCoeff * force * kz;
}
if (totalEnergy)
*totalEnergy += recipCoeff * ak * (cs * cs + ss * ss);
lowrz = 1 - numRz;
}
lowry = 1 - numRy;
}
}
}
}
void CpuNonbondedForceVec8::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
this->posq = posq;
this->atomCoordinates = &atomCoordinates[0];
this->atomParameters = &atomParameters[0];
this->exclusions = &exclusions[0];
this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL);
threadEnergy.resize(threads.getNumThreads());
gmx_atomic_t counter;
gmx_atomic_set(&counter, 0);
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish.
ComputeDirectTask task(*this);
threads.execute(task);
threads.waitForThreads();
// Combine the energies from all the threads.
if (totalEnergy != NULL) {
double directEnergy = 0;
int numThreads = threads.getNumThreads();
for (int i = 0; i < numThreads; i++)
directEnergy += threadEnergy[i];
*totalEnergy += directEnergy;
}
}
void CpuNonbondedForceVec8::threadComputeDirect(ThreadPool& threads, int threadIndex) {
// Compute this thread's subset of interactions.
int numThreads = threads.getNumThreads();
threadEnergy[threadIndex] = 0;
double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
float* forces = &(*threadForce)[threadIndex][0];
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
if (ewald || pme) {
// Compute the interactions from the neighbor list.
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR).lowerVec()[0];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
if (includeEnergy)
threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
}
}
}
}
else if (cutoff) {
// Compute the interactions from the neighbor list.
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
}
else {
// Loop over all atom pairs
while (true) {
int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (i >= numberOfAtoms)
break;
for (int j = i+1; j < numberOfAtoms; j++)
if (exclusions[j].find(i) == exclusions[j].end())
calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
}
}
}
void CpuNonbondedForceVec8::calculateOneIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// get deltaR, R2, and R between 2 atoms
fvec4 deltaR;
fvec4 posI(posq+4*ii);
fvec4 posJ(posq+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, periodic, boxSize, invBoxSize);
if (cutoff && r2 >= cutoffDistance*cutoffDistance)
return;
float r = sqrtf(r2);
float inverseR = 1/r;
float switchValue = 1, switchDeriv = 0;
if (useSwitch && r > switchingDistance) {
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
switchValue = 1+t*t*t*(-10+t*(15-t*6));
switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
}
float sig = atomParameters[ii].first + atomParameters[jj].first;
float sig2 = inverseR*sig;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float eps = atomParameters[ii].second*atomParameters[jj].second;
float dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
if (cutoff)
dEdR += (float) (chargeProd*(inverseR-2.0f*krf*r2));
else
dEdR += (float) (chargeProd*inverseR);
dEdR *= inverseR*inverseR;
float energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// accumulate energies
if (totalEnergy) {
if (cutoff)
energy += (float) (chargeProd*(inverseR+krf*r2-crf));
else
energy += (float) (chargeProd*inverseR);
*totalEnergy += energy;
}
// accumulate forces
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*ii)+result).store(forces+4*ii);
(fvec4(forces+4*jj)-result).store(forces+4*jj);
}
void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec4 blockAtomForce[8];
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]);
fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]);
fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = false;
if (periodic) {
for (int i = 0; i < 8 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
fvec4 rt[8];
transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 8; j++) {
blockAtomForce[j] += rt[j];
atomForce -= rt[j];
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec4 blockAtomForce[8];
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec8 blockAtomX = fvec8(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0], blockAtomPosq[4][0], blockAtomPosq[5][0], blockAtomPosq[6][0], blockAtomPosq[7][0]);
fvec8 blockAtomY = fvec8(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1], blockAtomPosq[4][1], blockAtomPosq[5][1], blockAtomPosq[6][1], blockAtomPosq[7][1]);
fvec8 blockAtomZ = fvec8(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2], blockAtomPosq[4][2], blockAtomPosq[5][2], blockAtomPosq[6][2], blockAtomPosq[7][2]);
fvec8 blockAtomCharge = fvec8(ONE_4PI_EPS0)*fvec8(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3], blockAtomPosq[4][3], blockAtomPosq[5][3], blockAtomPosq[6][3], blockAtomPosq[7][3]);
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = false;
if (periodic) {
for (int i = 0; i < 8 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
fvec4 rt[8];
transpose(result[0], result[1], result[2], result[3], rt[0], rt[1], rt[2], rt[3], rt[4], rt[5], rt[6], rt[7]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 8; j++) {
blockAtomForce[j] += rt[j];
atomForce -= rt[j];
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
deltaR = posJ-posI;
if (periodic) {
fvec4 base = round(deltaR*invBoxSize)*boxSize;
deltaR = deltaR-base;
}
r2 = dot3(deltaR, deltaR);
}
void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec8 CpuNonbondedForceVec8::erfcApprox(fvec8 x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(fvec8 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec8 x1 = x*ewaldDXInv;
ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
fvec8 coeff2 = x1-index;
fvec8 coeff1 = 1.0f-coeff2;
ivec4 indexLower = index.lowerVec();
ivec4 indexUpper = index.upperVec();
fvec4 t1(&ewaldScaleTable[indexLower[0]]);
fvec4 t2(&ewaldScaleTable[indexLower[1]]);
fvec4 t3(&ewaldScaleTable[indexLower[2]]);
fvec4 t4(&ewaldScaleTable[indexLower[3]]);
fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
fvec8 s1, s2, s3, s4;
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
...@@ -51,6 +51,7 @@ void testNeighborList(bool periodic) { ...@@ -51,6 +51,7 @@ void testNeighborList(bool periodic) {
const int numParticles = 500; const int numParticles = 500;
const float cutoff = 2.0f; const float cutoff = 2.0f;
const float boxSize[3] = {20.0f, 15.0f, 22.0f}; const float boxSize[3] = {20.0f, 15.0f, 22.0f};
const int blockSize = 8;
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
AlignedArray<float> positions(4*numParticles); AlignedArray<float> positions(4*numParticles);
...@@ -66,15 +67,15 @@ void testNeighborList(bool periodic) { ...@@ -66,15 +67,15 @@ void testNeighborList(bool periodic) {
} }
} }
ThreadPool threads; ThreadPool threads;
CpuNeighborList neighborList; CpuNeighborList neighborList(blockSize);
neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads); neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
// Convert the neighbor list to a set for faster lookup. // Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors; set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) { for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
int blockIndex = i/CpuNeighborList::BlockSize; int blockIndex = i/blockSize;
int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize; int indexInBlock = i-blockIndex*blockSize;
char mask = 1<<indexInBlock; char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) { for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) { if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment