Commit 0b5d58d7 authored by Charlles Abreu's avatar Charlles Abreu
Browse files

Conflict resolution in TestSplineFilter.cpp

parents 9026dbe7 b0d13582
/** /**
* Perform the first step of Velocity Verlet integration. * Perform the first part of integration: velocity step.
*/ */
KERNEL void integrateNoseHooverMiddlePart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force,
KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq, GLOBAL const mixed2* RESTRICT dt, GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList) {
GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta, mixed fscale = dt[0].y/(mixed) 0x100000000;
GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
#ifdef USE_MIXED_PRECISION
,GLOBAL const real4* RESTRICT posqCorrection
#endif
){
const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = 0.5f * dtVel/(mixed) 0x100000000;
int index = GLOBAL_ID; int index = GLOBAL_ID;
while (index < numAtoms) { while (index < numAtoms) {
int atom = atomList[index]; int atom = atomList[index];
mixed4 velocity = velm[atom]; mixed4 velocity = velm[atom];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION velocity.x += fscale*force[atom]*velocity.w;
real4 pos1 = posq[atom]; velocity.y += fscale*force[atom+paddedNumAtoms]*velocity.w;
real4 pos2 = posqCorrection[atom]; velocity.z += fscale*force[atom+paddedNumAtoms*2]*velocity.w;
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[atom];
#endif
velocity.x += scale*force[atom]*velocity.w;
velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w;
velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w;
pos.x = velocity.x*dtPos;
pos.y = velocity.y*dtPos;
pos.z = velocity.z*dtPos;
posDelta[atom] = pos;
velm[atom] = velocity; velm[atom] = velocity;
} }
index += GLOBAL_SIZE; index += GLOBAL_SIZE;
...@@ -58,12 +38,12 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN ...@@ -58,12 +38,12 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
relVel.z= v2.z - v1.z; relVel.z= v2.z - v1.z;
mixed3 comFrc; mixed3 comFrc;
mixed F1x = scale*force[atom1]; mixed F1x = fscale*force[atom1];
mixed F1y = scale*force[atom1+paddedNumAtoms]; mixed F1y = fscale*force[atom1+paddedNumAtoms];
mixed F1z = scale*force[atom1+paddedNumAtoms*2]; mixed F1z = fscale*force[atom1+paddedNumAtoms*2];
mixed F2x = scale*force[atom2]; mixed F2x = fscale*force[atom2];
mixed F2y = scale*force[atom2+paddedNumAtoms]; mixed F2y = fscale*force[atom2+paddedNumAtoms];
mixed F2z = scale*force[atom2+paddedNumAtoms*2]; mixed F2z = fscale*force[atom2+paddedNumAtoms*2];
comFrc.x = F1x + F2x; comFrc.x = F1x + F2x;
comFrc.y = F1y + F2y; comFrc.y = F1y + F2y;
comFrc.z = F1z + F2z; comFrc.z = F1z + F2z;
...@@ -77,35 +57,16 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN ...@@ -77,35 +57,16 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
relVel.x += relFrc.x * invRedMass; relVel.x += relFrc.x * invRedMass;
relVel.y += relFrc.y * invRedMass; relVel.y += relFrc.y * invRedMass;
relVel.z += relFrc.z * invRedMass; relVel.z += relFrc.z * invRedMass;
#ifdef USE_MIXED_PRECISION
real4 posv1 = posq[atom1];
real4 posv2 = posq[atom2];
real4 posc1 = posqCorrection[atom1];
real4 posc2 = posqCorrection[atom2];
mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
#else
real4 pos1 = posq[atom1];
real4 pos2 = posq[atom2];
#endif
if (v1.w != 0.0f) { if (v1.w != 0.0f) {
v1.x = comVel.x - relVel.x*mass2fract; v1.x = comVel.x - relVel.x*mass2fract;
v1.y = comVel.y - relVel.y*mass2fract; v1.y = comVel.y - relVel.y*mass2fract;
v1.z = comVel.z - relVel.z*mass2fract; v1.z = comVel.z - relVel.z*mass2fract;
pos1.x = v1.x*dtPos;
pos1.y = v1.y*dtPos;
pos1.z = v1.z*dtPos;
posDelta[atom1] = pos1;
velm[atom1] = v1; velm[atom1] = v1;
} }
if (v2.w != 0.0f) { if (v2.w != 0.0f) {
v2.x = comVel.x + relVel.x*mass1fract; v2.x = comVel.x + relVel.x*mass1fract;
v2.y = comVel.y + relVel.y*mass1fract; v2.y = comVel.y + relVel.y*mass1fract;
v2.z = comVel.z + relVel.z*mass1fract; v2.z = comVel.z + relVel.z*mass1fract;
pos2.x = v2.x*dtPos;
pos2.y = v2.y*dtPos;
pos2.z = v2.z*dtPos;
posDelta[atom2] = pos2;
velm[atom2] = v2; velm[atom2] = v2;
} }
index += GLOBAL_SIZE; index += GLOBAL_SIZE;
...@@ -113,22 +74,60 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN ...@@ -113,22 +74,60 @@ KERNEL void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedN
} }
/** /**
* Perform the second step of Velocity Verlet integration. * Perform the second part of integration: position half step
*/
KERNEL void integrateNoseHooverMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) {
mixed halfdt = 0.5f*dt[0].y;
int index = GLOBAL_ID;
while (index < numAtoms) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
posDelta[index] = delta;
oldDelta[index] = delta;
}
index += GLOBAL_SIZE;
}
}
/**
* Perform the third part of integration: another position half step
*/ */
KERNEL void integrateNoseHooverMiddlePart3(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) {
mixed halfdt = 0.5f*dt[0].y;
int index = GLOBAL_ID;
while (index < numAtoms) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
posDelta[index] += delta;
oldDelta[index] += delta;
}
index += GLOBAL_SIZE;
}
}
KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm, /**
GLOBAL const mixed4* RESTRICT posDelta * Perform the fourth part of integration: apply constraint forces to velocities, then record
* the constrained positions.
*/
KERNEL void integrateNoseHooverMiddlePart4(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
,GLOBAL real4* RESTRICT posqCorrection , GLOBAL real4* RESTRICT posqCorrection
#endif #endif
){ ) {
mixed2 stepSize = dt[0]; mixed invDt = 1/dt[0].y;
int index = GLOBAL_ID; for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
if (index == 0)
dt[0].x = stepSize.y;
while(index < numAtoms) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
mixed4 delta = posDelta[index];
velocity.x += (delta.x-oldDelta[index].x)*invDt;
velocity.y += (delta.y-oldDelta[index].y)*invDt;
velocity.z += (delta.z-oldDelta[index].z)*invDt;
velm[index] = velocity;
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index]; real4 pos2 = posqCorrection[index];
...@@ -136,7 +135,6 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d ...@@ -136,7 +135,6 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d
#else #else
real4 pos = posq[index]; real4 pos = posq[index];
#endif #endif
mixed4 delta = posDelta[index];
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
...@@ -147,120 +145,10 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d ...@@ -147,120 +145,10 @@ KERNEL void integrateVelocityVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT d
posq[index] = pos; posq[index] = pos;
#endif #endif
} }
index += GLOBAL_SIZE;
} }
} }
/** KERNEL void integrateNoseHooverHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance,
* Perform the third step of Velocity Verlet integration.
*/
KERNEL void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed4* RESTRICT posDelta,
GLOBAL const int* RESTRICT atomList, GLOBAL const int2* RESTRICT pairList
#ifdef USE_MIXED_PRECISION
,GLOBAL const real4* RESTRICT posqCorrection
#endif
){
mixed2 stepSize = dt[0];
#ifdef SUPPORTS_DOUBLE_PRECISION
double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
int index = GLOBAL_ID;
if (index == 0)
dt[0].x = stepSize.y;
while(index < numAtoms) {
int atom = atomList[index];
mixed4 velocity = velm[atom];
if (velocity.w != 0.0) {
mixed4 deltaXconstrained = posDelta[atom];
velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
#ifndef SUPPORTS_DOUBLE_PRECISION
velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
#endif
velm[atom] = velocity;
}
index += GLOBAL_SIZE;
}
index = GLOBAL_ID;
while(index < numPairs) {
int atom1 = pairList[index].x;
int atom2 = pairList[index].y;
mixed4 v1 = velm[atom1];
mixed4 v2 = velm[atom2];
mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
mixed mass1fract = m1 / (m1 + m2);
mixed mass2fract = m2 / (m1 + m2);
mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
mixed3 comVel;
comVel.x= v1.x*mass1fract + v2.x*mass2fract;
comVel.y= v1.y*mass1fract + v2.y*mass2fract;
comVel.z= v1.z*mass1fract + v2.z*mass2fract;
mixed3 relVel;
relVel.x= v2.x - v1.x;
relVel.y= v2.y - v1.y;
relVel.z= v2.z - v1.z;
mixed3 comFrc;
mixed F1x = scale*force[atom1];
mixed F1y = scale*force[atom1+paddedNumAtoms];
mixed F1z = scale*force[atom1+paddedNumAtoms*2];
mixed F2x = scale*force[atom2];
mixed F2y = scale*force[atom2+paddedNumAtoms];
mixed F2z = scale*force[atom2+paddedNumAtoms*2];
comFrc.x = F1x + F2x;
comFrc.y = F1y + F2y;
comFrc.z = F1z + F2z;
mixed3 relFrc;
relFrc.x = mass1fract*F2x - mass2fract*F1x;
relFrc.y = mass1fract*F2y - mass2fract*F1y;
relFrc.z = mass1fract*F2z - mass2fract*F1z;
comVel.x += comFrc.x * invTotMass;
comVel.y += comFrc.y * invTotMass;
comVel.z += comFrc.z * invTotMass;
relVel.x += relFrc.x * invRedMass;
relVel.y += relFrc.y * invRedMass;
relVel.z += relFrc.z * invRedMass;
if (v1.w != 0.0f) {
mixed4 deltaXconstrained = posDelta[atom1];
v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
#ifndef SUPPORTS_DOUBLE_PRECISION
v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
#endif
velm[atom1] = v1;
}
if (v2.w != 0.0f) {
mixed4 deltaXconstrained = posDelta[atom2];
v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
#ifndef SUPPORTS_DOUBLE_PRECISION
v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
#endif
velm[atom2] = v2;
}
index += GLOBAL_SIZE;
}
}
KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RESTRICT maxPairDistance,
GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
GLOBAL mixed4* RESTRICT velm, GLOBAL const int2* RESTRICT pairList, GLOBAL mixed4* RESTRICT velm, GLOBAL const int2* RESTRICT pairList,
GLOBAL const float* RESTRICT pairTemperature GLOBAL const float* RESTRICT pairTemperature
...@@ -370,4 +258,3 @@ KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RE ...@@ -370,4 +258,3 @@ KERNEL void integrateVelocityVerletHardWall(int numPairs, GLOBAL const float* RE
} }
} }
} }
...@@ -53,7 +53,15 @@ public: ...@@ -53,7 +53,15 @@ public:
int getBlockSize() const; int getBlockSize() const;
const std::vector<int>& getSortedAtoms() const; const std::vector<int>& getSortedAtoms() const;
const std::vector<int>& getBlockNeighbors(int blockIndex) const; const std::vector<int>& getBlockNeighbors(int blockIndex) const;
const std::vector<char>& getBlockExclusions(int blockIndex) const;
/**
* Bitset for a single block, marking which indexes should be excluded. This data type needs to be big
* enough to store all the bits for any possible block size.
*/
using BlockExclusionMask = int16_t;
const std::vector<BlockExclusionMask>& getBlockExclusions(int blockIndex) const;
/** /**
* This routine contains the code executed by each thread. * This routine contains the code executed by each thread.
*/ */
...@@ -64,7 +72,7 @@ private: ...@@ -64,7 +72,7 @@ private:
std::vector<int> sortedAtoms; std::vector<int> sortedAtoms;
std::vector<float> sortedPositions; std::vector<float> sortedPositions;
std::vector<std::vector<int> > blockNeighbors; std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions; std::vector<std::vector<BlockExclusionMask> > blockExclusions;
// The following variables are used to make information accessible to the individual threads. // The following variables are used to make information accessible to the individual threads.
float minx, maxx, miny, maxy, minz, maxz; float minx, maxx, miny, maxy, minz, maxz;
std::vector<std::pair<int, int> > atomBins; std::vector<std::pair<int, int> > atomBins;
......
/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
* Contributors: Daniel Towner
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
#define OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
#include "CpuNonbondedForce.h"
#include "openmm/internal/vectorize.h"
#include "SimTKOpenMMUtilities.h"
#include <algorithm>
#include <vector>
namespace OpenMM {
enum BlockType {EWALD, NON_EWALD}; // :TODO: Better name for non-ewald.
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
/**
* Generic SIMD implementation of CpuNonbondedForce. The templating allows the same
* basic code to be reused for any sort of SIMD type, including SSE, AVX, AVX2, or
* AVX-512.
*/
template<typename FVEC>
class CpuNonbondedForceFvec : public CpuNonbondedForce {
public:
/**
* Store how many elements are contained in each block of atoms.
*/
static constexpr int blockSize = sizeof(FVEC) / sizeof(float);
protected:
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block. These are part of the virtual function interface
and consequently have names which explicitly call Ewald variant or not.
They internally call into the generic handler function below.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
---------------------------------------------------------------------------------------
@{
*/
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/** @} */
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block. Identical to function prototypes above but
with an extra template parameter to choose whether to use Ewald processing or not.
--------------------------------------------------------------------------------------- */
template<BlockType BLOCK_TYPE>
void calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Templatized implementation of calculateBlockIxn. It can handle both Ewald and non-ewald interactions
* through a template parameter since the code is so similar for the two cases. Note also that the
* floating-point SIMD type is also templated to allow any suitable type to be used.
*/
template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
template <int PERIODIC_TYPE>
void getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute an approximation of a function using a table lookup.
**/
FVEC approximateFunctionFromTable(const std::vector<float>& table, FVEC x, FVEC inverse) const;
};
/**
* Use a table lookup to approximate a function specific function.
*/
template<typename FVEC>
FVEC
CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<float>& table,
const FVEC x, const FVEC inverse) const {
// Compute the set of 8 index positions from which to gather the table data.
const auto x1 = x * inverse;
const auto index = min(floor(x1), float(NUM_TABLE_POINTS));
FVEC s1, s2;
gatherVecPair(table.data(), index, s1, s2);
const auto coeff2 = x1-FVEC(index);
const auto coeff1 = 1.0f-coeff2;
return coeff1*s1 + coeff2*s2;
}
template<typename FVEC>
void CpuNonbondedForceFvec<FVEC>::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
calculateBlockIxnHandler<BlockType::NON_EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
}
template<typename FVEC>
void CpuNonbondedForceFvec<FVEC>::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
calculateBlockIxnHandler<BlockType::EWALD>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
}
template<typename FVEC>
template<BlockType BLOCK_TYPE>
void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnHandler(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Determine whether we need to apply periodic boundary conditions.
PeriodicType periodicType;
fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
using std::min;
using std::max;
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < blockSize; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockIxnImpl<NoPeriodic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockIxnImpl<PeriodicPerAtom, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockIxnImpl<PeriodicPerInteraction, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockIxnImpl<PeriodicTriclinic, BLOCK_TYPE>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
}
template<typename FVEC>
template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex];
fvec4 blockAtomPosq[blockSize];
FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < blockSize; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize; // :TODO: Apply one to blockAtom?
}
transpose(blockAtomPosq, blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
// Not the most efficient way to do this, but it works across all types we care about, and this isn't where
// the cycles are spent anyway.
FVEC blockAtomSigma = {};
FVEC blockAtomEpsilon = {};
for (int i=0; i<blockSize; ++i)
{
((float*)&blockAtomSigma)[i] = atomParameters[blockAtom[i]].first;
((float*)&blockAtomEpsilon)[i] = atomParameters[blockAtom[i]].second;
}
// Ewald needs C6 data gathered from a table. Unused variable for non-ewald.
const FVEC C6s = (BLOCK_TYPE == BlockType::EWALD) ? FVEC(C6params, blockAtom) : FVEC();
const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
const FVEC cutoffDistanceSquared = cutoffDistance * cutoffDistance;
// Loop over neighbors for this block.
const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
FVEC dx, dy, dz, r2;
fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
const auto inverseR = rsqrt(r2);
const auto r = r2*inverseR;
FVEC energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
const auto sig = blockAtomSigma+atomParameters[atom].first;
const auto sig2 = (inverseR*sig)*(inverseR*sig);
const auto sig6 = sig2*sig2*sig2;
const auto eps = blockAtomEpsilon*atomEpsilon;
const auto epsSig6 = eps*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
const auto t = blendZero((r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
const auto switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
const auto switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
if (BLOCK_TYPE == BlockType::EWALD && ljpme) {
const auto C6ij = C6s*C6params[atom];
const auto inverseR2 = inverseR*inverseR;
const auto mysig2 = sig*sig;
const auto mysig6 = mysig2*mysig2*mysig2;
const auto emult = C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(exptermsTable, r, FVEC(exptermsDXInv));
const auto potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*approximateFunctionFromTable(dExptermsTable, r, FVEC(exptermsDXInv));
energy += emult + potentialShift;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
const auto chargeProd = blockAtomCharge*posq[4*atom+3];
if (BLOCK_TYPE == BlockType::EWALD)
{
dEdR += chargeProd*inverseR*approximateFunctionFromTable(ewaldScaleTable, r, FVEC(ewaldDXInv));
}
else
{
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
}
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
if (BLOCK_TYPE == BlockType::EWALD)
energy += chargeProd*inverseR*approximateFunctionFromTable(erfcTable, alphaEwald*r, FVEC(erfcDXInv));
else // Non-ewald.
{
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
}
energy = blendZero(energy, include);
*totalEnergy += reduceAdd(energy);
}
// Accumulate forces.
dEdR = blendZero(dEdR, include);
const auto fx = dx*dEdR;
const auto fy = dy*dEdR;
const auto fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* const atomForce = forces+4*atom;
const fvec4 newAtomForce = fvec4(atomForce) - reduceToVec3(fx, fy, fz);
newAtomForce.store(atomForce);
}
// Record the forces on the block atoms.
fvec4 f[blockSize];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
for (int j = 0; j < blockSize; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
template<typename FVEC>
template <int PERIODIC_TYPE>
void CpuNonbondedForceFvec<FVEC>::getDeltaR(const fvec4& posI, const FVEC& x, const FVEC& y, const FVEC& z, FVEC& dx, FVEC& dy, FVEC& dz, FVEC& r2, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (PERIODIC_TYPE == PeriodicTriclinic) {
const auto scale3 = floor(dz*recipBoxSize[2]+0.5f);
dx -= scale3*periodicBoxVectors[2][0];
dy -= scale3*periodicBoxVectors[2][1];
dz -= scale3*periodicBoxVectors[2][2];
const auto scale2 = floor(dy*recipBoxSize[1]+0.5f);
dx -= scale2*periodicBoxVectors[1][0];
dy -= scale2*periodicBoxVectors[1][1];
const auto scale1 = floor(dx*recipBoxSize[0]+0.5f);
dx -= scale1*periodicBoxVectors[0][0];
}
else if (PERIODIC_TYPE == PeriodicPerInteraction) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
} // namespace OpenMM
#endif // OPENMM_CPU_NONBONDED_FORCE_FVEC_H__
/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
#include "CpuNonbondedForce.h"
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForceVec4 : public CpuNonbondedForce {
public:
/**---------------------------------------------------------------------------------------
Constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec4();
protected:
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Templatized implementation of calculateBlockIxn.
*/
template <int PERIODIC_TYPE>
void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Templatized implementation of calculateBlockEwaldIxn.
*/
template <int PERIODIC_TYPE>
void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
template <int PERIODIC_TYPE>
void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
fvec4 erfcApprox(const fvec4& x);
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec4 ewaldScaleFunction(const fvec4& x);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
* where dar = (dispersionAlpha * R)
* needed for LJPME energies.
*/
fvec4 exptermsApprox(const fvec4& R);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
* where dar = (dispersionAlpha * R)
* needed for LJPME forces.
*/
fvec4 dExptermsApprox(const fvec4& R);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
#include "CpuNonbondedForce.h"
#ifdef __AVX__
#include "openmm/internal/vectorize8.h"
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForceVec8 : public CpuNonbondedForce {
public:
CpuNonbondedForceVec8();
protected:
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Templatized implementation of calculateBlockIxn.
*/
template <int PERIODIC_TYPE>
void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Templatized implementation of calculateBlockEwaldIxn.
*/
template <int PERIODIC_TYPE>
void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
template <int PERIODIC_TYPE>
void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
fvec8 erfcApprox(const fvec8& x);
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec8 ewaldScaleFunction(const fvec8& x);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
* where dar = (dispersionAlpha * R)
* needed for LJPME energies.
*/
fvec8 exptermsApprox(const fvec8& R);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
* where dar = (dispersionAlpha * R)
* needed for LJPME forces.
*/
fvec8 dExptermsApprox(const fvec8& R);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // __AVX__
#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
...@@ -363,7 +363,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i ...@@ -363,7 +363,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i
const int blockSize = neighborList->getBlockSize(); const int blockSize = neighborList->getBlockSize();
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex); const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
int first = neighbors[i]; int first = neighbors[i];
for (int k = 0; k < blockSize; k++) { for (int k = 0; k < blockSize; k++) {
...@@ -458,7 +458,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da ...@@ -458,7 +458,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da
const int blockSize = neighborList->getBlockSize(); const int blockSize = neighborList->getBlockSize();
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex); const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
int first = neighbors[i]; int first = neighbors[i];
for (int k = 0; k < blockSize; k++) { for (int k = 0; k < blockSize; k++) {
...@@ -545,7 +545,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms, ...@@ -545,7 +545,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
const int blockSize = neighborList->getBlockSize(); const int blockSize = neighborList->getBlockSize();
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& blockExclusions = neighborList->getBlockExclusions(blockIndex); const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
int first = neighbors[i]; int first = neighbors[i];
for (int k = 0; k < blockSize; k++) { for (int k = 0; k < blockSize; k++) {
......
...@@ -110,7 +110,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, vector< ...@@ -110,7 +110,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, vector<
neighborList->computeNeighborList(numParticles, posq, exclusions, periodicBoxVectors, usePeriodic, cutoffDistance, threads); neighborList->computeNeighborList(numParticles, posq, exclusions, periodicBoxVectors, usePeriodic, cutoffDistance, threads);
for (int blockIndex = 0; blockIndex < neighborList->getNumBlocks(); blockIndex++) { for (int blockIndex = 0; blockIndex < neighborList->getNumBlocks(); blockIndex++) {
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex); const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
int numNeighbors = neighbors.size(); int numNeighbors = neighbors.size();
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int p1 = neighborList->getSortedAtoms()[4*blockIndex+i]; int p1 = neighborList->getSortedAtoms()[4*blockIndex+i];
......
...@@ -195,7 +195,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread ...@@ -195,7 +195,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
const int blockSize = neighborList->getBlockSize(); const int blockSize = neighborList->getBlockSize();
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex); const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
int first = neighbors[i]; int first = neighbors[i];
for (int j = 0; j < (int) paramNames.size(); j++) for (int j = 0; j < (int) paramNames.size(); j++)
......
...@@ -183,7 +183,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex, ...@@ -183,7 +183,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
const int blockSize = neighborList->getBlockSize(); const int blockSize = neighborList->getBlockSize();
const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex); const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex); const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
int first = neighbors[i]; int first = neighbors[i];
if (particles[first].sqrtEpsilon == 0.0f) if (particles[first].sqrtEpsilon == 0.0f)
......
...@@ -472,16 +472,11 @@ private: ...@@ -472,16 +472,11 @@ private:
int numParticles; int numParticles;
}; };
bool isVec8Supported(); CpuNonbondedForce* createCpuNonbondedForceVec();
CpuNonbondedForce* createCpuNonbondedForceVec4();
CpuNonbondedForce* createCpuNonbondedForceVec8();
CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform), CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
data(data), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) { data(data), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
if (isVec8Supported()) nonbonded = createCpuNonbondedForceVec();
nonbonded = createCpuNonbondedForceVec8();
else
nonbonded = createCpuNonbondedForceVec4();
} }
CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() { CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
......
...@@ -164,7 +164,7 @@ public: ...@@ -164,7 +164,7 @@ public:
return VoxelIndex(y, z); return VoxelIndex(y, z);
} }
void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const { void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<CpuNeighborList::BlockExclusionMask>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
neighbors.resize(0); neighbors.resize(0);
exclusions.resize(0); exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0); fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
...@@ -484,10 +484,10 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -484,10 +484,10 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
int numPadding = numBlocks*blockSize-numAtoms; int numPadding = numBlocks*blockSize-numAtoms;
if (numPadding > 0) { if (numPadding > 0) {
char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding); const BlockExclusionMask mask = (~0) << (blockSize - numPadding);
for (int i = 0; i < numPadding; i++) for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0); sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1]; auto& exc = blockExclusions[blockExclusions.size()-1];
for (int i = 0; i < (int) exc.size(); i++) for (int i = 0; i < (int) exc.size(); i++)
exc[i] |= mask; exc[i] |= mask;
} }
...@@ -509,7 +509,7 @@ const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const ...@@ -509,7 +509,7 @@ const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const
return blockNeighbors[blockIndex]; return blockNeighbors[blockIndex];
} }
const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) const { const std::vector<CpuNeighborList::BlockExclusionMask>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
return blockExclusions[blockIndex]; return blockExclusions[blockIndex];
} }
...@@ -573,12 +573,12 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -573,12 +573,12 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
// Record the exclusions for this block. // Record the exclusions for this block.
map<int, char> atomFlags; map<int, BlockExclusionMask> atomFlags;
for (int j = 0; j < atomsInBlock; j++) { for (int j = 0; j < atomsInBlock; j++) {
const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]]; const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
char mask = 1<<j; const BlockExclusionMask mask = 1<<j;
for (int exclusion : atomExclusions) { for (int exclusion : atomExclusions) {
map<int, char>::iterator thisAtomFlags = atomFlags.find(exclusion); const auto thisAtomFlags = atomFlags.find(exclusion);
if (thisAtomFlags == atomFlags.end()) if (thisAtomFlags == atomFlags.end())
atomFlags[exclusion] = mask; atomFlags[exclusion] = mask;
else else
...@@ -588,7 +588,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -588,7 +588,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
int numNeighbors = blockNeighbors[i].size(); int numNeighbors = blockNeighbors[i].size();
for (int k = 0; k < numNeighbors; k++) { for (int k = 0; k < numNeighbors; k++) {
int atomIndex = blockNeighbors[i][k]; int atomIndex = blockNeighbors[i][k];
map<int, char>::iterator thisAtomFlags = atomFlags.find(atomIndex); auto thisAtomFlags = atomFlags.find(atomIndex);
if (thisAtomFlags != atomFlags.end()) if (thisAtomFlags != atomFlags.end())
blockExclusions[i][k] |= thisAtomFlags->second; blockExclusions[i][k] |= thisAtomFlags->second;
} }
......
/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
* Contributors: Daniel Towner
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CpuNonbondedForceFvec.h"
OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4();
OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8();
bool isVec8Supported();
OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec() {
if (isVec8Supported())
return createCpuNonbondedForceVec8();
else
return createCpuNonbondedForceVec4();
}
int getVecBlockSize() {
if (isVec8Supported())
return 8;
else
return 4;
}
...@@ -22,440 +22,11 @@ ...@@ -22,440 +22,11 @@
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
#include "SimTKOpenMMUtilities.h" #include "CpuNonbondedForceFvec.h"
#include "CpuNonbondedForceVec4.h"
#include <algorithm>
#include <iostream>
using namespace std; // Very minimal file. It exists purely to be able to compile it in SIMD-4.
using namespace OpenMM;
/** OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4() {
* Factory method to create a CpuNonbondedForceVec4. return new OpenMM::CpuNonbondedForceFvec<fvec4>();
*/
CpuNonbondedForce* createCpuNonbondedForceVec4() {
return new CpuNonbondedForceVec4();
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec4 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
}
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Determine whether we need to apply periodic boundary conditions.
PeriodicType periodicType;
fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 4; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
}
template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 inverseR = rsqrt(r2);
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 r = r2*inverseR;
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Determine whether we need to apply periodic boundary conditions.
PeriodicType periodicType;
fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 4; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
}
template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 inverseR = rsqrt(r2);
fvec4 r = r2*inverseR;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomEpsilon;
fvec4 epsSig6 = eps*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
if (ljpme) {
fvec4 C6ij = C6s*C6params[atom];
fvec4 inverseR2 = inverseR*inverseR;
fvec4 mysig2 = sig*sig;
fvec4 mysig6 = mysig2*mysig2*mysig2;
fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
energy += emult + potentialShift;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (PERIODIC_TYPE == PeriodicTriclinic) {
fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
dx -= scale3*periodicBoxVectors[2][0];
dy -= scale3*periodicBoxVectors[2][1];
dz -= scale3*periodicBoxVectors[2][2];
fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
dx -= scale2*periodicBoxVectors[1][0];
dy -= scale2*periodicBoxVectors[1][1];
fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
dx -= scale1*periodicBoxVectors[0][0];
}
else if (PERIODIC_TYPE == PeriodicPerInteraction) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
fvec4 x1 = x*erfcDXInv;
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&erfcTable[index[0]]);
fvec4 t2(&erfcTable[index[1]]);
fvec4 t3(&erfcTable[index[2]]);
fvec4 t4(&erfcTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec4 x1 = x*ewaldDXInv;
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&ewaldScaleTable[index[0]]);
fvec4 t2(&ewaldScaleTable[index[1]]);
fvec4 t3(&ewaldScaleTable[index[2]]);
fvec4 t4(&ewaldScaleTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
fvec4 r1 = r*exptermsDXInv;
ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
fvec4 coeff2 = r1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&exptermsTable[index[0]]);
fvec4 t2(&exptermsTable[index[1]]);
fvec4 t3(&exptermsTable[index[2]]);
fvec4 t4(&exptermsTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
fvec4 r1 = r*exptermsDXInv;
ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
fvec4 coeff2 = r1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&dExptermsTable[index[0]]);
fvec4 t2(&dExptermsTable[index[1]]);
fvec4 t3(&dExptermsTable[index[2]]);
fvec4 t4(&dExptermsTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
} }
This diff is collapsed.
...@@ -165,11 +165,14 @@ CpuPlatform::PlatformData::~PlatformData() { ...@@ -165,11 +165,14 @@ CpuPlatform::PlatformData::~PlatformData() {
delete neighborList; delete neighborList;
} }
bool isVec8Supported(); /**
* Return how much vectorisation is supported for host platform.
*/
int getVecBlockSize();
void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const vector<set<int> >& exclusionList) { void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const vector<set<int> >& exclusionList) {
if (neighborList == NULL) if (neighborList == NULL)
neighborList = new CpuNeighborList(isVec8Supported() ? 8 : 4); neighborList = new CpuNeighborList(getVecBlockSize());
if (cutoffDistance > cutoff) if (cutoffDistance > cutoff)
cutoff = cutoffDistance; cutoff = cutoffDistance;
if (cutoffDistance+padding > paddedCutoff) if (cutoffDistance+padding > paddedCutoff)
......
...@@ -133,10 +133,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform ...@@ -133,10 +133,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CommonIntegrateCustomStepKernel(name, platform, cu); return new CommonIntegrateCustomStepKernel(name, platform, cu);
if (name == ApplyAndersenThermostatKernel::Name()) if (name == ApplyAndersenThermostatKernel::Name())
return new CommonApplyAndersenThermostatKernel(name, platform, cu); return new CommonApplyAndersenThermostatKernel(name, platform, cu);
if (name == NoseHooverChainKernel::Name()) if (name == IntegrateNoseHooverStepKernel::Name())
return new CommonNoseHooverChainKernel(name, platform, cu); return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
if (name == IntegrateVelocityVerletStepKernel::Name())
return new CommonIntegrateVelocityVerletStepKernel(name, platform, cu);
if (name == ApplyMonteCarloBarostatKernel::Name()) if (name == ApplyMonteCarloBarostatKernel::Name())
return new CudaApplyMonteCarloBarostatKernel(name, platform, cu); return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
if (name == RemoveCMMotionKernel::Name()) if (name == RemoveCMMotionKernel::Name())
......
...@@ -96,7 +96,7 @@ CudaPlatform::CudaPlatform() { ...@@ -96,7 +96,7 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory); registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory); registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory); registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory); registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory); registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory); registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory); registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
...@@ -104,7 +104,6 @@ CudaPlatform::CudaPlatform() { ...@@ -104,7 +104,6 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory); registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateCustomStepKernel::Name(), factory); registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory); registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
registerKernelFactory(NoseHooverChainKernel::Name(), factory);
registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory); registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
registerKernelFactory(RemoveCMMotionKernel::Name(), factory); registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(CudaDeviceIndex()); platformProperties.push_back(CudaDeviceIndex());
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverThermostat.h"
void runPlatformTests() {
}
...@@ -131,10 +131,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo ...@@ -131,10 +131,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
return new CommonIntegrateCustomStepKernel(name, platform, cl); return new CommonIntegrateCustomStepKernel(name, platform, cl);
if (name == ApplyAndersenThermostatKernel::Name()) if (name == ApplyAndersenThermostatKernel::Name())
return new CommonApplyAndersenThermostatKernel(name, platform, cl); return new CommonApplyAndersenThermostatKernel(name, platform, cl);
if (name == NoseHooverChainKernel::Name()) if (name == IntegrateNoseHooverStepKernel::Name())
return new CommonNoseHooverChainKernel(name, platform, cl); return new CommonIntegrateNoseHooverStepKernel(name, platform, cl);
if (name == IntegrateVelocityVerletStepKernel::Name())
return new CommonIntegrateVelocityVerletStepKernel(name, platform, cl);
if (name == ApplyMonteCarloBarostatKernel::Name()) if (name == ApplyMonteCarloBarostatKernel::Name())
return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl); return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
if (name == RemoveCMMotionKernel::Name()) if (name == RemoveCMMotionKernel::Name())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment