Commit da998c51 authored by peastman's avatar peastman
Browse files

Merge pull request #251 from peastman/avx

CPU platform can use AVX when available
parents 59982c00 6cde1860
#ifndef OPENMM_VECTORIZE8_H_
#define OPENMM_VECTORIZE8_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "vectorize.h"
#include <immintrin.h>
// This file defines classes and functions to simplify vectorizing code with AVX.
class ivec8;
/**
* An eight element vector of floats.
*/
class fvec8 {
public:
__m256 val;
fvec8() {}
fvec8(float v) : val(_mm256_set1_ps(v)) {}
fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
fvec8(__m256 v) : val(v) {}
fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
operator __m256() const {
return val;
}
fvec4 lowerVec() const {
return _mm256_castps256_ps128(val);
}
fvec4 upperVec() const {
return _mm256_extractf128_ps(val, 1);
}
void store(float* v) const {
_mm256_storeu_ps(v, val);
}
fvec8 operator+(fvec8 other) const {
return _mm256_add_ps(val, other);
}
fvec8 operator-(fvec8 other) const {
return _mm256_sub_ps(val, other);
}
fvec8 operator*(fvec8 other) const {
return _mm256_mul_ps(val, other);
}
fvec8 operator/(fvec8 other) const {
return _mm256_div_ps(val, other);
}
void operator+=(fvec8 other) {
val = _mm256_add_ps(val, other);
}
void operator-=(fvec8 other) {
val = _mm256_sub_ps(val, other);
}
void operator*=(fvec8 other) {
val = _mm256_mul_ps(val, other);
}
void operator/=(fvec8 other) {
val = _mm256_div_ps(val, other);
}
fvec8 operator-() const {
return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
}
fvec8 operator&(fvec8 other) const {
return _mm256_and_ps(val, other);
}
fvec8 operator|(fvec8 other) const {
return _mm256_or_ps(val, other);
}
fvec8 operator==(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
}
fvec8 operator!=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
}
fvec8 operator>(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
}
fvec8 operator<(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
}
fvec8 operator>=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
}
fvec8 operator<=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
}
operator ivec8() const;
};
/**
* An eight element vector of ints.
*/
class ivec8 {
public:
__m256i val;
ivec8() {}
ivec8(int v) : val(_mm256_set1_epi32(v)) {}
ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) : val(_mm256_set_epi32(v8, v7, v6, v5, v4, v3, v2, v1)) {}
ivec8(__m256i v) : val(v) {}
ivec8(const int* v) : val(_mm256_loadu_si256((const __m256i*) v)) {}
operator __m256i() const {
return val;
}
ivec4 lowerVec() const {
return _mm256_castsi256_si128(val);
}
ivec4 upperVec() const {
return _mm256_extractf128_si256(val, 1);
}
void store(int* v) const {
_mm256_storeu_si256((__m256i*) v, val);
}
ivec8 operator&(ivec8 other) const {
return _mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val));
}
ivec8 operator|(ivec8 other) const {
return _mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val));
}
operator fvec8() const;
};
// Conversion operators.
inline fvec8::operator ivec8() const {
return _mm256_cvttps_epi32(val);
}
inline ivec8::operator fvec8() const {
return _mm256_cvtepi32_ps(val);
}
// Functions that operate on fvec8s.
static inline fvec8 floor(fvec8 v) {
return fvec8(_mm256_floor_ps(v.val));
}
static inline fvec8 ceil(fvec8 v) {
return fvec8(_mm256_ceil_ps(v.val));
}
static inline fvec8 round(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
}
static inline fvec8 min(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_min_ps(v1.val, v2.val));
}
static inline fvec8 max(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_max_ps(v1.val, v2.val));
}
static inline fvec8 abs(fvec8 v) {
static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
return fvec8(_mm256_and_ps(v.val, mask));
}
static inline fvec8 sqrt(fvec8 v) {
return fvec8(_mm256_sqrt_ps(v.val));
}
static inline float dot8(fvec8 v1, fvec8 v2) {
fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
}
static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
_MM_TRANSPOSE4_PS(in1, in2, in3, in4);
_MM_TRANSPOSE4_PS(in5, in6, in7, in8);
out1 = _mm256_castps128_ps256(in1);
out1 = _mm256_insertf128_ps(out1, in5, 1);
out2 = _mm256_castps128_ps256(in2);
out2 = _mm256_insertf128_ps(out2, in6, 1);
out3 = _mm256_castps128_ps256(in3);
out3 = _mm256_insertf128_ps(out3, in7, 1);
out4 = _mm256_castps128_ps256(in4);
out4 = _mm256_insertf128_ps(out4, in8, 1);
}
static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
out1 = in1.lowerVec();
out2 = in2.lowerVec();
out3 = in3.lowerVec();
out4 = in4.lowerVec();
_MM_TRANSPOSE4_PS(out1, out2, out3, out4);
out5 = in1.upperVec();
out6 = in2.upperVec();
out7 = in3.upperVec();
out8 = in4.upperVec();
_MM_TRANSPOSE4_PS(out5, out6, out7, out8);
}
// Functions that operate on ivec8s.
static inline bool any(ivec8 v) {
return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec8 operator+(float v1, fvec8 v2) {
return fvec8(v1)+v2;
}
static inline fvec8 operator-(float v1, fvec8 v2) {
return fvec8(v1)-v2;
}
static inline fvec8 operator*(float v1, fvec8 v2) {
return fvec8(v1)*v2;
}
static inline fvec8 operator/(float v1, fvec8 v2) {
return fvec8(v1)/v2;
}
// Operations for blending fvec8s based on an ivec8.
static inline fvec8 blend(fvec8 v1, fvec8 v2, ivec8 mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
}
#endif /*OPENMM_VECTORIZE8_H_*/
......@@ -88,9 +88,7 @@ private:
*/
class CpuCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
}
CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data);
~CpuCalcNonbondedForceKernel();
/**
* Initialize the kernel.
......@@ -130,8 +128,8 @@ private:
std::vector<std::pair<float, float> > particleParams;
std::vector<RealVec> lastPositions;
NonbondedMethod nonbondedMethod;
CpuNeighborList neighborList;
CpuNonbondedForce nonbonded;
CpuNeighborList* neighborList;
CpuNonbondedForce* nonbonded;
Kernel optimizedPme;
};
......
......@@ -45,8 +45,7 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
public:
class ThreadTask;
class Voxels;
static const int BlockSize;
CpuNeighborList();
CpuNeighborList(int blockSize);
void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
int getNumBlocks() const;
......@@ -59,6 +58,7 @@ public:
void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
void runThread(int index);
private:
int blockSize;
std::vector<int> sortedAtoms;
std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions;
......
......@@ -48,7 +48,13 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
CpuNonbondedForce();
/**
* Virtual destructor.
*/
virtual ~CpuNonbondedForce();
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
......@@ -151,7 +157,7 @@ class CpuNonbondedForce {
*/
void threadComputeDirect(ThreadPool& threads, int threadIndex);
private:
protected:
bool cutoff;
bool useSwitch;
bool periodic;
......@@ -204,7 +210,7 @@ private:
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
virtual void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
/**---------------------------------------------------------------------------------------
......@@ -216,7 +222,7 @@ private:
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
virtual void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
/**
* Compute the displacement and squared distance between two points, optionally using
......@@ -224,26 +230,15 @@ private:
*/
void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
static fvec4 erfcApprox(fvec4 x);
/**
* Create a lookup table for the scale factor used with Ewald and PME.
*/
void tabulateEwaldScaleFactor();
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
* Compute a fast approximation to erfc(x).
*/
fvec4 ewaldScaleFunction(fvec4 x);
static float erfcApprox(float x);
};
} // namespace OpenMM
......
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
#include "CpuNonbondedForce.h"
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForceVec4 : public CpuNonbondedForce {
public:
/**---------------------------------------------------------------------------------------
Constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec4();
protected:
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
static fvec4 erfcApprox(fvec4 x);
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec4 ewaldScaleFunction(fvec4 x);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
#ifdef __AVX__
#include "CpuNonbondedForce.h"
#include "openmm/internal/vectorize8.h"
// ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForceVec8 : public CpuNonbondedForce {
public:
CpuNonbondedForceVec8();
protected:
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
*/
void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/**
* Compute a fast approximation to erfc(x).
*/
static fvec8 erfcApprox(fvec8 x);
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec8 ewaldScaleFunction(fvec8 x);
};
} // namespace OpenMM
// ---------------------------------------------------------------------------------------
#endif // __AVX__
#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
FOREACH(file ${SOURCE_FILES})
IF (file MATCHES ".*Vec8.*")
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
ENDIF (file MATCHES ".*Vec8.*")
ENDFOREACH(file)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
......@@ -7,6 +11,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
......@@ -145,6 +145,22 @@ private:
int numParticles;
};
bool isVec8Supported();
CpuNonbondedForce* createCpuNonbondedForceVec4();
CpuNonbondedForce* createCpuNonbondedForceVec8();
CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
if (isVec8Supported()) {
neighborList = new CpuNeighborList(8);
nonbonded = createCpuNonbondedForceVec8();
}
else {
neighborList = new CpuNeighborList(4);
nonbonded = createCpuNonbondedForceVec4();
}
}
CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
if (bonded14ParamArray != NULL) {
for (int i = 0; i < num14; i++) {
......@@ -154,6 +170,10 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
delete bonded14IndexArray;
delete bonded14ParamArray;
}
if (nonbonded != NULL)
delete nonbonded;
if (neighborList != NULL)
delete neighborList;
}
void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
......@@ -305,26 +325,26 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
if (needRecompute) {
neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
neighborList->computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
lastPositions = posData;
}
nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
}
if (data.isPeriodic) {
double minAllowedSize = 1.999999*nonbondedCutoff;
if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
nonbonded.setPeriodic(floatBoxSize);
nonbonded->setPeriodic(floatBoxSize);
}
if (ewald)
nonbonded.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
nonbonded->setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
if (pme)
nonbonded.setUsePME(ewaldAlpha, gridSize);
nonbonded->setUsePME(ewaldAlpha, gridSize);
if (useSwitchingFunction)
nonbonded.setUseSwitchingFunction(switchingDistance);
nonbonded->setUseSwitchingFunction(switchingDistance);
double nonbondedEnergy = 0;
if (includeDirect)
nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
if (includeReciprocal) {
if (useOptimizedPme) {
PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
......@@ -333,7 +353,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
else
nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
}
energy += nonbondedEnergy;
if (includeDirect) {
......
......@@ -43,8 +43,6 @@ using namespace std;
namespace OpenMM {
const int CpuNeighborList::BlockSize = 4;
class VoxelIndex
{
public:
......@@ -62,8 +60,8 @@ public:
*/
class CpuNeighborList::Voxels {
public:
Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
if (usePeriodic) {
nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
......@@ -156,7 +154,7 @@ public:
return VoxelIndex(x, y);
}
void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
neighbors.resize(0);
exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
......@@ -175,9 +173,6 @@ public:
float centerPos[4];
blockCenter.store(centerPos);
VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
VoxelIndex atomVoxelIndex[BlockSize];
for (int i = 0; i < (int) blockAtoms.size(); i++)
atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
int startx = centerVoxelIndex.x-dIndexX;
int starty = centerVoxelIndex.y-dIndexY;
int endx = centerVoxelIndex.x+dIndexX;
......@@ -193,7 +188,7 @@ public:
endx = min(endx, nx-1);
endy = min(endy, ny-1);
}
int lastSortedIndex = BlockSize*(blockIndex+1);
int lastSortedIndex = blockSize*(blockIndex+1);
VoxelIndex voxelIndex(0, 0);
for (int x = startx; x <= endx; ++x) {
voxelIndex.x = x;
......@@ -300,10 +295,12 @@ public:
// Add this atom to the list of neighbors.
neighbors.push_back(sortedAtoms[sortedIndex]);
if (sortedIndex < BlockSize*blockIndex)
if (sortedIndex < blockSize*blockIndex)
exclusions.push_back(0);
else
exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
else {
int mask = (1<<blockSize)-1;
exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
}
}
}
}
......@@ -311,6 +308,7 @@ public:
}
private:
int blockSize;
float voxelSizeX, voxelSizeY;
float minx, maxx, miny, maxy;
int nx, ny;
......@@ -329,12 +327,12 @@ public:
CpuNeighborList& owner;
};
CpuNeighborList::CpuNeighborList() {
CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
}
void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
int numBlocks = (numAtoms+blockSize-1)/blockSize;
blockNeighbors.resize(numBlocks);
blockExclusions.resize(numBlocks);
sortedAtoms.resize(numAtoms);
......@@ -381,7 +379,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
}
Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex;
......@@ -397,9 +395,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
// Add padding atoms to fill up the last block.
int numPadding = numBlocks*BlockSize-numAtoms;
int numPadding = numBlocks*blockSize-numAtoms;
if (numPadding > 0) {
char mask = (0xF0 >> numPadding) & 0xF;
char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1];
......@@ -409,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
}
int CpuNeighborList::getNumBlocks() const {
return sortedAtoms.size()/BlockSize;
return sortedAtoms.size()/blockSize;
}
const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
......@@ -446,14 +444,18 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
int numBlocks = blockNeighbors.size();
vector<int> blockAtoms;
vector<VoxelIndex> atomVoxelIndex;
for (int i = threadIndex; i < numBlocks; i += numThreads) {
// Find the atoms in this block and compute their bounding box.
int firstIndex = BlockSize*i;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
int firstIndex = blockSize*i;
int atomsInBlock = min(blockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++)
atomVoxelIndex.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++) {
blockAtoms[j] = sortedAtoms[firstIndex+j];
atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
}
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos;
for (int j = 1; j < atomsInBlock; j++) {
......@@ -461,7 +463,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
// Record the exclusions for this block.
......
......@@ -29,7 +29,6 @@
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will
......@@ -61,6 +60,9 @@ public:
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
}
CpuNonbondedForce::~CpuNonbondedForce() {
}
/**---------------------------------------------------------------------------------------
Set the force to use a cutoff.
......@@ -356,7 +358,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR)[0];
float erfcAlphaR = erfcApprox(alphaR);
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
......@@ -446,228 +448,6 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
(fvec4(forces+4*jj)-result).store(forces+4*jj);
}
void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = false;
if (periodic) {
for (int i = 0; i < 4 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
transpose(result[0], result[1], result[2], result[3]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = false;
for (int i = 0; i < 4 && !needPeriodic; i++)
for (int j = 0; j < 3; j++)
if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
needPeriodic = true;
break;
}
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, 1.0f);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
transpose(result[0], result[1], result[2], result[3]);
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
blockAtomForce[j] += result[j];
atomForce -= result[j];
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
deltaR = posJ-posI;
if (periodic) {
......@@ -677,41 +457,15 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
r2 = dot3(deltaR, deltaR);
}
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
float CpuNonbondedForce::erfcApprox(float x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec4 x1 = x*ewaldDXInv;
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&ewaldScaleTable[index[0]]);
fvec4 t2(&ewaldScaleTable[index[1]]);
fvec4 t3(&ewaldScaleTable[index[2]]);
fvec4 t4(&ewaldScaleTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec4.h"
using namespace std;
using namespace OpenMM;
/**
* Factory method to create a CpuNonbondedForceVec4.
*/
CpuNonbondedForce* createCpuNonbondedForceVec4() {
return new CpuNonbondedForceVec4();
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec4 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
}
void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2;
getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec4 one(1.0f);
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot4(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec4 fx = dx*dEdR;
fvec4 fy = dy*dEdR;
fvec4 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot4(fx, one);
atomForce[1] -= dot4(fy, one);
atomForce[2] -= dot4(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec4 CpuNonbondedForceVec4::erfcApprox(fvec4 x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec4 x1 = x*ewaldDXInv;
ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&ewaldScaleTable[index[0]]);
fvec4 t2(&ewaldScaleTable[index[1]]);
fvec4 t3(&ewaldScaleTable[index[2]]);
fvec4 t4(&ewaldScaleTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
* Contributors: Pande Group
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "SimTKOpenMMCommon.h"
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec8.h"
#include "openmm/internal/hardware.h"
using namespace std;
using namespace OpenMM;
#ifndef __AVX__
bool isVec8Supported() {
return false;
}
CpuNonbondedForce* createCpuNonbondedForceVec8() {
throw OpenMMException("Internal error: OpenMM was compiled without AVX support");
}
#else
/**
* Check whether 8 component vectors are supported with the current CPU.
*/
bool isVec8Supported() {
// Make sure the CPU supports AVX.
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
}
return false;
}
/**
* Factory method to create a CpuNonbondedForceVec8.
*/
CpuNonbondedForce* createCpuNonbondedForceVec8() {
return new CpuNonbondedForceVec8();
}
/**---------------------------------------------------------------------------------------
CpuNonbondedForceVec8 constructor
--------------------------------------------------------------------------------------- */
CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
}
void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) {
if (cutoff)
energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 fx = dx*dEdR;
fvec8 fy = dy*dEdR;
fvec8 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot8(fx, one);
atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[8];
fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
// Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2;
getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include;
char excl = exclusions[i];
if (excl == 0)
include = -1;
else
include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
include = include & (r2 < cutoffDistance*cutoffDistance);
if (!any(include))
continue; // No interactions to compute.
// Compute the interactions.
fvec8 r = sqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) {
fvec8 sig = blockAtomSigma+atomParameters[atom].first;
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
}
else {
energy = 0.0f;
dEdR = 0.0f;
}
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR;
// Accumulate energies.
fvec8 one(1.0f);
if (totalEnergy) {
energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
energy = blend(0.0f, energy, include);
*totalEnergy += dot8(energy, one);
}
// Accumulate forces.
dEdR = blend(0.0f, dEdR, include);
fvec8 fx = dx*dEdR;
fvec8 fy = dy*dEdR;
fvec8 fz = dz*dEdR;
blockAtomForceX += fx;
blockAtomForceY += fy;
blockAtomForceZ += fz;
float* atomForce = forces+4*atom;
atomForce[0] -= dot8(fx, one);
atomForce[1] -= dot8(fy, one);
atomForce[2] -= dot8(fz, one);
}
// Record the forces on the block atoms.
fvec4 f[8];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
for (int j = 0; j < 8; j++)
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0];
dy = y-posI[1];
dz = z-posI[2];
if (periodic) {
dx -= round(dx*invBoxSize[0])*boxSize[0];
dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
r2 = dx*dx + dy*dy + dz*dz;
}
fvec8 CpuNonbondedForceVec8::erfcApprox(fvec8 x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t;
t *= t;
t *= t;
return 1.0f/(t*t);
}
fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(fvec8 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
fvec8 x1 = x*ewaldDXInv;
ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
fvec8 coeff2 = x1-index;
fvec8 coeff1 = 1.0f-coeff2;
ivec4 indexLower = index.lowerVec();
ivec4 indexUpper = index.upperVec();
fvec4 t1(&ewaldScaleTable[indexLower[0]]);
fvec4 t2(&ewaldScaleTable[indexLower[1]]);
fvec4 t3(&ewaldScaleTable[indexLower[2]]);
fvec4 t4(&ewaldScaleTable[indexLower[3]]);
fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
fvec8 s1, s2, s3, s4;
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
#endif
......@@ -51,6 +51,7 @@ void testNeighborList(bool periodic) {
const int numParticles = 500;
const float cutoff = 2.0f;
const float boxSize[3] = {20.0f, 15.0f, 22.0f};
const int blockSize = 8;
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
AlignedArray<float> positions(4*numParticles);
......@@ -66,15 +67,15 @@ void testNeighborList(bool periodic) {
}
}
ThreadPool threads;
CpuNeighborList neighborList;
CpuNeighborList neighborList(blockSize);
neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
// Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
int blockIndex = i/CpuNeighborList::BlockSize;
int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
int blockIndex = i/blockSize;
int indexInBlock = i-blockIndex*blockSize;
char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment