"libraries/pthreads/vscode:/vscode.git/clone" did not exist on "cf8a03e833ae2dc49853b8935ecb09a4872bce71"
Commit 8167c79b authored by leeping's avatar leeping
Browse files

Merge branch 'master' of github.com:leeping/openmm

parents 855ece90 99ef4344
...@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) ...@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there. ## OpenMM was previously installed there.
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir) ENDFOREACH(subdir)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-msse2 -DHAVE_SSE2=1")
# If API wrappers are being generated, and add them to the build. # If API wrappers are being generated, and add them to the build.
FIND_PROGRAM(GCCXML_PATH gccxml PATH FIND_PROGRAM(GCCXML_PATH gccxml PATH
......
...@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, ...@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
* This function fills the internal state array with pseudorandom * This function fills the internal state array with pseudorandom
* integers. * integers.
*/ */
inline static void gen_rand_all(void) { inline static void gen_rand_all(SFMT& sfmt) {
int i; int i;
__m128i r, r1, r2, mask; __m128i r, r1, r2, mask;
mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
r1 = _mm_load_si128(&sfmt[N - 2].si); SFMTData& data = *sfmt.data;
r2 = _mm_load_si128(&sfmt[N - 1].si); r1 = _mm_load_si128(&data.sfmt[N - 2].si);
r2 = _mm_load_si128(&data.sfmt[N - 1].si);
for (i = 0; i < N - POS1; i++) { for (i = 0; i < N - POS1; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
_mm_store_si128(&sfmt[i].si, r); _mm_store_si128(&data.sfmt[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
for (; i < N; i++) { for (; i < N; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1 - N].si, r1, r2, mask);
_mm_store_si128(&sfmt[i].si, r); _mm_store_si128(&data.sfmt[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
...@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) { ...@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
* @param array an 128-bit array to be filled by pseudorandom numbers. * @param array an 128-bit array to be filled by pseudorandom numbers.
* @param size number of 128-bit pesudorandom numbers to be generated. * @param size number of 128-bit pesudorandom numbers to be generated.
*/ */
inline static void gen_rand_array(w128_t *array, int size) { inline static void gen_rand_array(w128_t *array, int size, SFMT& sfmt) {
int i, j; int i, j;
__m128i r, r1, r2, mask; __m128i r, r1, r2, mask;
mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
r1 = _mm_load_si128(&sfmt[N - 2].si); SFMTData& data = *sfmt.data;
r2 = _mm_load_si128(&sfmt[N - 1].si); r1 = _mm_load_si128(&data.sfmt[N - 2].si);
r2 = _mm_load_si128(&data.sfmt[N - 1].si);
for (i = 0; i < N - POS1; i++) { for (i = 0; i < N - POS1; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
for (; i < N; i++) { for (; i < N; i++) {
r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
...@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) { ...@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
} }
for (j = 0; j < 2 * N - size; j++) { for (j = 0; j < 2 * N - size; j++) {
r = _mm_load_si128(&array[j + size - N].si); r = _mm_load_si128(&array[j + size - N].si);
_mm_store_si128(&sfmt[j].si, r); _mm_store_si128(&data.sfmt[j].si, r);
} }
for (; i < size; i++) { for (; i < size; i++) {
r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2, r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
mask); mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
_mm_store_si128(&sfmt[j++].si, r); _mm_store_si128(&data.sfmt[j++].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
......
...@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size); ...@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
#endif #endif
#if defined(HAVE_ALTIVEC) #if defined(HAVE_ALTIVEC)
#include "SFMT-alti.h" #include "sfmt/SFMT-alti.h"
#elif defined(HAVE_SSE2) #elif defined(HAVE_SSE2)
#include "SFMT-sse2.h" #include "sfmt/SFMT-sse2.h"
#endif #endif
/** /**
......
#ifndef OPENMM_VECTORIZE_H_
#define OPENMM_VECTORIZE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class ivec4;
/**
* A four element vector of floats.
*/
class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
operator __m128() const {
return val;
}
float operator[](int i) const {
int resultBits = _mm_extract_ps(val, i);
return *((float*) &resultBits);
}
void store(float* v) const {
_mm_storeu_ps(v, val);
}
fvec4 operator+(fvec4 other) const {
return _mm_add_ps(val, other);
}
fvec4 operator-(fvec4 other) const {
return _mm_sub_ps(val, other);
}
fvec4 operator*(fvec4 other) const {
return _mm_mul_ps(val, other);
}
fvec4 operator/(fvec4 other) const {
return _mm_div_ps(val, other);
}
void operator+=(fvec4 other) {
val = _mm_add_ps(val, other);
}
void operator-=(fvec4 other) {
val = _mm_sub_ps(val, other);
}
void operator*=(fvec4 other) {
val = _mm_mul_ps(val, other);
}
void operator/=(fvec4 other) {
val = _mm_div_ps(val, other);
}
fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val);
}
fvec4 operator&(fvec4 other) const {
return _mm_and_ps(val, other);
}
fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other);
}
fvec4 operator!=(fvec4 other) const {
return _mm_cmpneq_ps(val, other);
}
fvec4 operator>(fvec4 other) const {
return _mm_cmpgt_ps(val, other);
}
fvec4 operator<(fvec4 other) const {
return _mm_cmplt_ps(val, other);
}
fvec4 operator>=(fvec4 other) const {
return _mm_cmpge_ps(val, other);
}
fvec4 operator<=(fvec4 other) const {
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
};
/**
* A four element vector of ints.
*/
class ivec4 {
public:
__m128i val;
ivec4() {}
ivec4(int v) : val(_mm_set1_epi32(v)) {}
ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
ivec4(__m128i v) : val(v) {}
ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
operator __m128i() const {
return val;
}
int operator[](int i) const {
return _mm_extract_epi32(val, i);
}
void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val);
}
ivec4 operator+(ivec4 other) const {
return _mm_add_epi32(val, other);
}
ivec4 operator-(ivec4 other) const {
return _mm_sub_epi32(val, other);
}
ivec4 operator*(ivec4 other) const {
return _mm_mul_epi32(val, other);
}
void operator+=(ivec4 other) {
val = _mm_add_epi32(val, other);
}
void operator-=(ivec4 other) {
val = _mm_sub_epi32(val, other);
}
void operator*=(ivec4 other) {
val = _mm_mul_epi32(val, other);
}
ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val);
}
ivec4 operator&(ivec4 other) const {
return _mm_and_si128(val, other);
}
ivec4 operator==(ivec4 other) const {
return _mm_cmpeq_epi32(val, other);
}
operator fvec4() const;
};
// Conversion operators.
inline fvec4::operator ivec4() const {
return _mm_cvttps_epi32(val);
}
inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val);
}
// Functions that operate on fvec4s.
static inline fvec4 floor(fvec4 v) {
return fvec4(_mm_floor_ps(v.val));
}
static inline fvec4 ceil(fvec4 v) {
return fvec4(_mm_ceil_ps(v.val));
}
static inline fvec4 round(fvec4 v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
}
static inline fvec4 min(fvec4 v1, fvec4 v2) {
return fvec4(_mm_min_ps(v1.val, v2.val));
}
static inline fvec4 max(fvec4 v1, fvec4 v2) {
return fvec4(_mm_max_ps(v1.val, v2.val));
}
static inline fvec4 abs(fvec4 v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask));
}
static inline fvec4 sqrt(fvec4 v) {
return fvec4(_mm_sqrt_ps(v.val));
}
static inline float dot3(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
static inline float dot4(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
// Functions that operate on ivec4s.
static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val));
}
static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val));
}
static inline ivec4 abs(ivec4 v) {
return ivec4(_mm_abs_epi32(v.val));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2;
}
#endif /*OPENMM_VECTORIZE_H_*/
...@@ -12,26 +12,42 @@ namespace OpenMM { ...@@ -12,26 +12,42 @@ namespace OpenMM {
class OPENMM_EXPORT_CPU CpuNeighborList { class OPENMM_EXPORT_CPU CpuNeighborList {
public: public:
class ThreadData; class ThreadData;
class VoxelHash; class Voxels;
static const int BlockSize;
CpuNeighborList(); CpuNeighborList();
~CpuNeighborList(); ~CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions, void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance); const float* periodicBoxSize, bool usePeriodic, float maxDistance);
const std::vector<std::pair<int, int> >& getNeighbors(); int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const;
const std::vector<int>& getBlockNeighbors(int blockIndex) const;
const std::vector<char>& getBlockExclusions(int blockIndex) const;
/** /**
* This routine contains the code executed by each thread. * This routine contains the code executed by each thread.
*/ */
void runThread(int index, std::vector<std::pair<int, int> >& threadNeighbors); void runThread(int index);
private: private:
/**
* This is called by the worker threads to wait until the master thread instructs them to advance.
*/
void threadWait();
/**
* This is called by the master thread to instruct all the worker threads to advance.
*/
void advanceThreads();
bool isDeleted; bool isDeleted;
int numThreads, waitCount; int numThreads, waitCount;
std::vector<std::pair<int, int> > neighbors; std::vector<int> sortedAtoms;
std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions;
std::vector<pthread_t> thread; std::vector<pthread_t> thread;
std::vector<ThreadData*> threadData; std::vector<ThreadData*> threadData;
pthread_cond_t startCondition, endCondition; pthread_cond_t startCondition, endCondition;
pthread_mutex_t lock; pthread_mutex_t lock;
// The following variables are used to make information accessible to the individual threads. // The following variables are used to make information accessible to the individual threads.
VoxelHash* voxelHash; float minx, maxx, miny, maxy, minz, maxz;
std::vector<std::pair<int, int> > atomBins;
Voxels* voxels;
const std::vector<std::set<int> >* exclusions; const std::vector<std::set<int> >* exclusions;
const float* atomLocations; const float* atomLocations;
const float* periodicBoxSize; const float* periodicBoxSize;
......
...@@ -25,14 +25,17 @@ ...@@ -25,14 +25,17 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__ #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__ #define OPENMM_CPU_NONBONDED_FORCE_H__
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h" #include "ReferencePairIxn.h"
#include "openmm/internal/vectorize.h"
#include <pthread.h> #include <pthread.h>
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <smmintrin.h>
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForce { class CpuNonbondedForce {
public: public:
class ThreadData; class ThreadData;
...@@ -63,7 +66,7 @@ class CpuNonbondedForce { ...@@ -63,7 +66,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void setUseCutoff(float distance, const std::vector<std::pair<int, int> >& neighbors, float solventDielectric); void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -127,9 +130,9 @@ class CpuNonbondedForce { ...@@ -127,9 +130,9 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::RealVec>& atomCoordinates, void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions, const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
std::vector<OpenMM::RealVec>& forces, float* totalEnergy) const; std::vector<RealVec>& forces, float* totalEnergy) const;
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -159,14 +162,14 @@ private: ...@@ -159,14 +162,14 @@ private:
bool periodic; bool periodic;
bool ewald; bool ewald;
bool pme; bool pme;
const std::vector<std::pair<int, int> >* neighborList; const CpuNeighborList* neighborList;
float periodicBoxSize[3]; float periodicBoxSize[3];
float cutoffDistance, switchingDistance; float cutoffDistance, switchingDistance;
float krf, crf; float krf, crf;
float alphaEwald; float alphaEwald;
int numRx, numRy, numRz; int numRx, numRy, numRz;
int meshDim[3]; int meshDim[3];
std::vector<float> ewaldScaleX, ewaldScaleY, ewaldScaleDeriv; std::vector<float> ewaldScaleTable;
float ewaldDX, ewaldDXInv; float ewaldDX, ewaldDXInv;
bool isDeleted; bool isDeleted;
int numThreads, waitCount; int numThreads, waitCount;
...@@ -195,31 +198,42 @@ private: ...@@ -195,31 +198,42 @@ private:
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize); void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Calculate LJ Coulomb pair ixn between two atoms Calculate all the interactions for one atom block.
@param atom1 the index of the first atom @param blockIndex the index of the atom block
@param atom2 the index of the second atom @param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added) @param forces force array (forces added)
@param totalEnergy total energy @param totalEnergy total energy
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateOneEwaldIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize); void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/** /**
* Compute the displacement and squared distance between two points, optionally using * Compute the displacement and squared distance between two points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
void getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic, const __m128& boxSize, const __m128& invBoxSize) const; void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
*/ */
static float erfcApprox(float x); static fvec4 erfcApprox(fvec4 x);
/** /**
* Create a lookup table for the scale factor used with Ewald and PME. * Create a lookup table for the scale factor used with Ewald and PME.
...@@ -229,9 +243,11 @@ private: ...@@ -229,9 +243,11 @@ private:
/** /**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI) * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/ */
float ewaldScaleFunction(float x); fvec4 ewaldScaleFunction(fvec4 x);
}; };
} // namespace OpenMM
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__ #endif // OPENMM_CPU_NONBONDED_FORCE_H__
...@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if (nonbondedMethod != NoCutoff) { if (nonbondedMethod != NoCutoff) {
// Determine whether we need to recompute the neighbor list. // Determine whether we need to recompute the neighbor list.
double padding = 0.1*nonbondedCutoff; double padding = 0.15*nonbondedCutoff;
bool needRecompute = false; bool needRecompute = false;
double closeCutoff2 = 0.25*padding*padding;
double farCutoff2 = 0.5*padding*padding;
int maxNumMoved = numParticles/10;
vector<int> moved;
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < numParticles; i++) {
RealVec delta = posData[i]-lastPositions[i]; RealVec delta = posData[i]-lastPositions[i];
if (delta.dot(delta) > 0.25*padding*padding) { double dist2 = delta.dot(delta);
needRecompute = true; if (dist2 > closeCutoff2) {
break; moved.push_back(i);
if (dist2 > farCutoff2 || moved.size() > maxNumMoved) {
needRecompute = true;
break;
}
} }
} }
if (!needRecompute && moved.size() > 0) {
// Some particles have moved further than half the padding distance. Look for pairs
// that are missing from the neighbor list.
int numMoved = moved.size();
double cutoff2 = nonbondedCutoff*nonbondedCutoff;
for (int i = 1; i < numMoved && !needRecompute; i++)
for (int j = 0; j < i; j++) {
RealVec delta = posData[moved[i]]-posData[moved[j]];
if (delta.dot(delta) < cutoff2) {
// These particles should interact. See if they are in the neighbor list.
RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
if (oldDelta.dot(oldDelta) > cutoff2) {
needRecompute = true;
break;
}
}
}
}
if (needRecompute) { if (needRecompute) {
neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding); neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
lastPositions = posData; lastPositions = posData;
} }
nonbonded.setUseCutoff(nonbondedCutoff, neighborList.getNeighbors(), rfDielectric); nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
} }
if (periodic || ewald || pme) { if (periodic || ewald || pme) {
double minAllowedSize = 1.999999*nonbondedCutoff; double minAllowedSize = 1.999999*nonbondedCutoff;
......
#include "CpuNeighborList.h" #include "CpuNeighborList.h"
#include "openmm/internal/hardware.h" #include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include "hilbert.h"
#include <algorithm>
#include <set> #include <set>
#include <map> #include <map>
#include <cmath> #include <cmath>
...@@ -9,137 +12,236 @@ using namespace std; ...@@ -9,137 +12,236 @@ using namespace std;
namespace OpenMM { namespace OpenMM {
const int CpuNeighborList::BlockSize = 4;
class VoxelIndex class VoxelIndex
{ {
public: public:
VoxelIndex(int xx, int yy, int zz) : x(xx), y(yy), z(zz) { VoxelIndex(int x, int y) : x(x), y(y) {
}
// operator<() needed for map
bool operator<(const VoxelIndex& other) const {
if (x < other.x) return true;
else if (x > other.x) return false;
else if (y < other.y) return true;
else if (y > other.y) return false;
else if (z < other.z) return true;
else return false;
} }
int x; int x;
int y; int y;
int z;
}; };
typedef pair<const float*, int> VoxelItem; typedef pair<const float*, int> VoxelItem;
typedef vector< VoxelItem > Voxel;
class CpuNeighborList::VoxelHash { /**
* This data structure organizes the particles spatially. It divides them into bins along the x and y axes,
* then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
*/
class CpuNeighborList::Voxels {
public: public:
VoxelHash(float vsx, float vsy, float vsz, const float* periodicBoxSize, bool usePeriodic) : Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
voxelSizeX(vsx), voxelSizeY(vsy), voxelSizeZ(vsz), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) { voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
if (usePeriodic) { if (usePeriodic) {
nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f); nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f); ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
nz = (int) floorf(periodicBoxSize[2]/voxelSizeZ+0.5f);
voxelSizeX = periodicBoxSize[0]/nx; voxelSizeX = periodicBoxSize[0]/nx;
voxelSizeY = periodicBoxSize[1]/ny; voxelSizeY = periodicBoxSize[1]/ny;
voxelSizeZ = periodicBoxSize[2]/nz; }
else {
nx = max(1, (int) floorf((maxx-minx)/voxelSizeX+0.5f));
ny = max(1, (int) floorf((maxy-miny)/voxelSizeY+0.5f));
if (maxx > minx)
voxelSizeX = (maxx-minx)/nx;
if (maxy > miny)
voxelSizeY = (maxy-miny)/ny;
}
bins.resize(nx);
for (int i = 0; i < nx; i++) {
bins[i].resize(ny);
for (int j = 0; j < ny; j++)
bins[i][j].resize(0);
} }
} }
void insert(const int& item, const float* location) { /**
* Insert a particle into the voxel data structure.
*/
void insert(const int& atom, const float* location) {
VoxelIndex voxelIndex = getVoxelIndex(location); VoxelIndex voxelIndex = getVoxelIndex(location);
if (voxelMap.find(voxelIndex) == voxelMap.end()) bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], VoxelItem(location, atom)));
voxelMap[voxelIndex] = Voxel(); }
Voxel& voxel = voxelMap.find(voxelIndex)->second;
voxel.push_back(VoxelItem(location, item)); /**
* Sort the particles in each voxel by z coordinate.
*/
void sortItems() {
for (int i = 0; i < nx; i++)
for (int j = 0; j < ny; j++)
sort(bins[i][j].begin(), bins[i][j].end());
} }
/**
* Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
*/
int findLowerBound(int x, int y, double z) const {
const vector<pair<float, VoxelItem> >& bin = bins[x][y];
int lower = 0;
int upper = bin.size();
while (lower < upper) {
int middle = (lower+upper)/2;
if (bin[middle].first < z)
lower = middle+1;
else
upper = middle;
}
return lower;
}
/**
* Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
*/
int findUpperBound(int x, int y, double z) const {
const vector<pair<float, VoxelItem> >& bin = bins[x][y];
int lower = 0;
int upper = bin.size();
while (lower < upper) {
int middle = (lower+upper)/2;
if (bin[middle].first > z)
upper = middle;
else
lower = middle+1;
}
return upper;
}
/**
* Get the voxel index containing a particular location.
*/
VoxelIndex getVoxelIndex(const float* location) const { VoxelIndex getVoxelIndex(const float* location) const {
float xperiodic, yperiodic, zperiodic; float xperiodic, yperiodic;
if (!usePeriodic) { if (!usePeriodic) {
xperiodic = location[0]; xperiodic = location[0]-minx;
yperiodic = location[1]; yperiodic = location[1]-miny;
zperiodic = location[2];
} }
else { else {
xperiodic = location[0]-periodicBoxSize[0]*floorf(location[0]/periodicBoxSize[0]); xperiodic = location[0]-periodicBoxSize[0]*floorf(location[0]/periodicBoxSize[0]);
yperiodic = location[1]-periodicBoxSize[1]*floorf(location[1]/periodicBoxSize[1]); yperiodic = location[1]-periodicBoxSize[1]*floorf(location[1]/periodicBoxSize[1]);
zperiodic = location[2]-periodicBoxSize[2]*floorf(location[2]/periodicBoxSize[2]);
} }
int x = int(floorf(xperiodic / voxelSizeX)); int x = min(nx-1, int(floorf(xperiodic / voxelSizeX)));
int y = int(floorf(yperiodic / voxelSizeY)); int y = min(ny-1, int(floorf(yperiodic / voxelSizeY)));
int z = int(floorf(zperiodic / voxelSizeZ));
return VoxelIndex(x, y, z); return VoxelIndex(x, y);
} }
void getNeighbors(vector<pair<int, int> >& neighbors, const VoxelItem& referencePoint, const vector<set<int> >& exclusions, float maxDistance) const { void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
neighbors.resize(0);
// Loop over neighboring voxels exclusions.resize(0);
// TODO use more clever selection of neighboring voxels fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize(1/periodicBoxSize[0], 1/periodicBoxSize[1], 1/periodicBoxSize[2], 0);
const int atomI = referencePoint.second;
const float* locationI = referencePoint.first;
__m128 posI = _mm_loadu_ps(locationI);
__m128 boxSize = _mm_set_ps(0, periodicBoxSize[2], periodicBoxSize[1], periodicBoxSize[0]);
__m128 invBoxSize = _mm_set_ps(0, (1/periodicBoxSize[2]), (1/periodicBoxSize[1]), (1/periodicBoxSize[0]));
__m128 half = _mm_set1_ps(0.5);
float maxDistanceSquared = maxDistance * maxDistance; float maxDistanceSquared = maxDistance * maxDistance;
float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
float refineCutoffSquared = refineCutoff*refineCutoff;
int dIndexX = int(maxDistance / voxelSizeX) + 1; // How may voxels away do we have to look? int dIndexX = min(nx/2, int((maxDistance+blockWidth[0])/voxelSizeX)+1); // How may voxels away do we have to look?
int dIndexY = int(maxDistance / voxelSizeY) + 1; int dIndexY = min(ny/2, int((maxDistance+blockWidth[1])/voxelSizeY)+1);
int dIndexZ = int(maxDistance / voxelSizeZ) + 1; float centerPos[4];
VoxelIndex centerVoxelIndex = getVoxelIndex(locationI); blockCenter.store(centerPos);
int lastx = centerVoxelIndex.x+dIndexX; VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
int lasty = centerVoxelIndex.y+dIndexY; int startx = centerVoxelIndex.x-dIndexX;
int lastz = centerVoxelIndex.z+dIndexZ; int starty = centerVoxelIndex.y-dIndexY;
int endx = centerVoxelIndex.x+dIndexX;
int endy = centerVoxelIndex.y+dIndexY;
int numRanges;
if (usePeriodic) { if (usePeriodic) {
lastx = min(lastx, centerVoxelIndex.x-dIndexX+nx-1); endx = min(endx, centerVoxelIndex.x-dIndexX+nx-1);
lasty = min(lasty, centerVoxelIndex.y-dIndexY+ny-1); endy = min(endy, centerVoxelIndex.y-dIndexY+ny-1);
lastz = min(lastz, centerVoxelIndex.z-dIndexZ+nz-1); numRanges = 2;
} }
VoxelIndex voxelIndex(0, 0, 0); else {
for (int x = centerVoxelIndex.x - dIndexX; x <= lastx; ++x) { startx = max(startx, 0);
starty = max(starty, 0);
endx = min(endx, nx-1);
endy = min(endy, ny-1);
numRanges = 1;
}
int lastSortedIndex = BlockSize*(blockIndex+1);
VoxelIndex voxelIndex(0, 0);
for (int x = startx; x <= endx; ++x) {
voxelIndex.x = x; voxelIndex.x = x;
if (usePeriodic) if (usePeriodic)
voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x)); voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
for (int y = centerVoxelIndex.y - dIndexY; y <= lasty; ++y) { float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
for (int y = starty; y <= endy; ++y) {
voxelIndex.y = y; voxelIndex.y = y;
if (usePeriodic) if (usePeriodic)
voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y)); voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
for (int z = centerVoxelIndex.z - dIndexZ; z <= lastz; ++z) { float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
voxelIndex.z = z;
if (usePeriodic) // Identify the range of atoms within this bin we need to search. When using periodic boundary
voxelIndex.z = (z < 0 ? z+nz : (z >= nz ? z-nz : z)); // conditions, there may be two separate ranges.
const map<VoxelIndex, Voxel>::const_iterator voxelEntry = voxelMap.find(voxelIndex);
if (voxelEntry == voxelMap.end()) float dz = maxDistance+blockWidth[2];
continue; // no such voxel; skip dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
const Voxel& voxel = voxelEntry->second; int rangeStart[2];
for (Voxel::const_iterator itemIter = voxel.begin(); itemIter != voxel.end(); ++itemIter) { int rangeEnd[2];
const int atomJ = itemIter->second; rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
if (usePeriodic) {
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
if (rangeStart[0] > 0) {
rangeStart[1] = 0;
rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
}
else {
rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
}
}
else
rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
// Loop over atoms and check to see if they are neighbors of this block.
for (int range = 0; range < numRanges; range++) {
for (int item = rangeStart[range]; item < rangeEnd[range]; item++) {
const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second.second;
// Avoid duplicate entries. // Avoid duplicate entries.
if (atomJ >= atomI) if (sortedIndex >= lastSortedIndex)
break; continue;
__m128 posJ = _mm_loadu_ps(itemIter->first); fvec4 atomPos(bins[voxelIndex.x][voxelIndex.y][item].second.first);
__m128 delta = _mm_sub_ps(posJ, posI); fvec4 delta = atomPos-centerPos;
if (usePeriodic) { if (usePeriodic) {
__m128 base = _mm_mul_ps(_mm_floor_ps(_mm_add_ps(_mm_mul_ps(delta, invBoxSize), half)), boxSize); fvec4 base = round(delta*invBoxSize)*boxSize;
delta = _mm_sub_ps(delta, base); delta = delta-base;
} }
float dSquared = _mm_cvtss_f32(_mm_dp_ps(delta, delta, 0x71)); delta = max(0.0f, abs(delta)-blockWidth);
float dSquared = dot3(delta, delta);
if (dSquared > maxDistanceSquared) if (dSquared > maxDistanceSquared)
continue; continue;
// Ignore exclusions. if (dSquared > refineCutoffSquared) {
if (exclusions[atomI].find(atomJ) != exclusions[atomI].end()) // The distance is large enough that there might not be any actual interactions.
continue; // Check individual atom pairs to be sure.
neighbors.push_back(make_pair(atomI, atomJ)); bool any = false;
for (int k = 0; k < (int) blockAtoms.size(); k++) {
fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
delta = atomPos-pos1;
if (usePeriodic) {
fvec4 base = round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
float r2 = dot3(delta, delta);
if (r2 < maxDistanceSquared) {
any = true;
break;
}
}
if (!any)
continue;
}
// Add this atom to the list of neighbors.
neighbors.push_back(sortedAtoms[sortedIndex]);
if (sortedIndex < BlockSize*blockIndex)
exclusions.push_back(0);
else
exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
} }
} }
} }
...@@ -147,11 +249,12 @@ public: ...@@ -147,11 +249,12 @@ public:
} }
private: private:
float voxelSizeX, voxelSizeY, voxelSizeZ; float voxelSizeX, voxelSizeY;
int nx, ny, nz; float minx, maxx, miny, maxy;
int nx, ny;
const float* periodicBoxSize; const float* periodicBoxSize;
const bool usePeriodic; const bool usePeriodic;
map<VoxelIndex, Voxel> voxelMap; vector<vector<vector<pair<float, VoxelItem> > > > bins;
}; };
class CpuNeighborList::ThreadData { class CpuNeighborList::ThreadData {
...@@ -160,12 +263,11 @@ public: ...@@ -160,12 +263,11 @@ public:
} }
int index; int index;
CpuNeighborList& owner; CpuNeighborList& owner;
vector<pair<int, int> > threadNeighbors;
}; };
static void* threadBody(void* args) { static void* threadBody(void* args) {
CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args); CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
data.owner.runThread(data.index, data.threadNeighbors); data.owner.runThread(data.index);
delete &data; delete &data;
return 0; return 0;
} }
...@@ -203,23 +305,13 @@ CpuNeighborList::~CpuNeighborList() { ...@@ -203,23 +305,13 @@ CpuNeighborList::~CpuNeighborList() {
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions, void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance) { const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
// Build the voxel hash. int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
blockNeighbors.resize(numBlocks);
float edgeSizeX, edgeSizeY, edgeSizeZ; blockExclusions.resize(numBlocks);
if (!usePeriodic) sortedAtoms.resize(numAtoms);
edgeSizeX = edgeSizeY = edgeSizeZ = maxDistance; // TODO - adjust this as needed
else {
edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
edgeSizeZ = 0.5f*periodicBoxSize[2]/floorf(periodicBoxSize[2]/maxDistance);
}
VoxelHash voxelHash(edgeSizeX, edgeSizeY, edgeSizeZ, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++)
voxelHash.insert(i, &atomLocations[4*i]);
// Record the parameters for the threads. // Record the parameters for the threads.
this->voxelHash = &voxelHash;
this->exclusions = &exclusions; this->exclusions = &exclusions;
this->atomLocations = &atomLocations[0]; this->atomLocations = &atomLocations[0];
this->periodicBoxSize = periodicBoxSize; this->periodicBoxSize = periodicBoxSize;
...@@ -227,43 +319,158 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato ...@@ -227,43 +319,158 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
this->usePeriodic = usePeriodic; this->usePeriodic = usePeriodic;
this->maxDistance = maxDistance; this->maxDistance = maxDistance;
// Signal the threads to start running and wait for them to finish. // Identify the range of atom positions along each axis.
fvec4 minPos(&atomLocations[0]);
fvec4 maxPos = minPos;
for (int i = 0; i < numAtoms; i++) {
fvec4 pos(&atomLocations[4*i]);
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
minx = minPos[0];
maxx = maxPos[0];
miny = minPos[1];
maxy = maxPos[1];
minz = minPos[2];
maxz = maxPos[2];
// Sort the atoms based on a Hilbert curve.
atomBins.resize(numAtoms);
pthread_mutex_lock(&lock); pthread_mutex_lock(&lock);
waitCount = 0; advanceThreads();
pthread_cond_broadcast(&startCondition); sort(atomBins.begin(), atomBins.end());
while (waitCount < numThreads)
pthread_cond_wait(&endCondition, &lock); // Build the voxel hash.
float edgeSizeX, edgeSizeY;
if (!usePeriodic)
edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
else {
edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
}
Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex;
voxels.insert(i, &atomLocations[4*atomIndex]);
}
voxels.sortItems();
this->voxels = &voxels;
// Signal the threads to start running and wait for them to finish.
advanceThreads();
pthread_mutex_unlock(&lock); pthread_mutex_unlock(&lock);
// Combine the results from all the threads. // Add padding atoms to fill up the last block.
neighbors.clear(); int numPadding = numBlocks*BlockSize-numAtoms;
for (int i = 0; i < numThreads; i++) if (numPadding > 0) {
neighbors.insert(neighbors.end(), threadData[i]->threadNeighbors.begin(), threadData[i]->threadNeighbors.end()); char mask = (0xF0 >> numPadding) & 0xF;
for (int i = 0; i < numPadding; i++)
sortedAtoms.push_back(0);
vector<char>& exc = blockExclusions[blockExclusions.size()-1];
for (int i = 0; i < (int) exc.size(); i++)
exc[i] |= mask;
}
}
int CpuNeighborList::getNumBlocks() const {
return sortedAtoms.size()/BlockSize;
} }
const vector<pair<int, int> >& CpuNeighborList::getNeighbors() { const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
return neighbors; return sortedAtoms;
}
const std::vector<int>& CpuNeighborList::getBlockNeighbors(int blockIndex) const {
return blockNeighbors[blockIndex];
}
const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) const {
return blockExclusions[blockIndex];
} }
void CpuNeighborList::runThread(int index, vector<pair<int, int> >& threadNeighbors) { void CpuNeighborList::runThread(int index) {
while (true) { while (true) {
// Wait for the signal to start running. // Wait for the signal to start running.
pthread_mutex_lock(&lock); threadWait();
waitCount++;
pthread_cond_signal(&endCondition);
pthread_cond_wait(&startCondition, &lock);
pthread_mutex_unlock(&lock);
if (isDeleted) if (isDeleted)
break; break;
// Compute the positions of atoms along the Hilbert curve.
float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
float invBinWidth = 1.0f/binWidth;
bitmask_t coords[3];
for (int i = index; i < numAtoms; i += numThreads) {
const float* pos = &atomLocations[4*i];
coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
int bin = (int) hilbert_c2i(3, 8, coords);
atomBins[i] = pair<int, int>(bin, i);
}
threadWait();
// Compute this thread's subset of neighbors. // Compute this thread's subset of neighbors.
threadNeighbors.clear(); int numBlocks = blockNeighbors.size();
for (int i = index; i < numAtoms; i += numThreads) vector<int> blockAtoms;
voxelHash->getNeighbors(threadNeighbors, VoxelItem(&atomLocations[4*i], i), *exclusions, maxDistance); for (int i = index; i < numBlocks; i += numThreads) {
{
int firstIndex = BlockSize*i;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
blockAtoms.resize(atomsInBlock);
for (int j = 0; j < atomsInBlock; j++)
blockAtoms[j] = sortedAtoms[firstIndex+j];
}
int firstIndex = BlockSize*i;
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
fvec4 maxPos = minPos;
int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
for (int j = 1; j < atomsInBlock; j++) {
fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
// Record the exclusions for this block.
for (int j = 0; j < atomsInBlock; j++) {
const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
char mask = 1<<j;
for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
int atomIndex = blockNeighbors[i][k];
if (atomExclusions.find(atomIndex) != atomExclusions.end())
blockExclusions[i][k] |= mask;
}
}
}
}
}
void CpuNeighborList::threadWait() {
pthread_mutex_lock(&lock);
waitCount++;
pthread_cond_signal(&endCondition);
pthread_cond_wait(&startCondition, &lock);
pthread_mutex_unlock(&lock);
}
void CpuNeighborList::advanceThreads() {
waitCount = 0;
pthread_cond_broadcast(&startCondition);
while (waitCount < numThreads) {
pthread_cond_wait(&endCondition, &lock);
} }
} }
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "ReferencePME.h" #include "ReferencePME.h"
#include "openmm/internal/hardware.h" #include "openmm/internal/hardware.h"
#include "openmm/internal/SplineFitter.h" #include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h"
// In case we're using some primitive version of Visual Studio this will // In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined. // make sure that erf() and erfc() are defined.
...@@ -113,7 +114,7 @@ CpuNonbondedForce::~CpuNonbondedForce(){ ...@@ -113,7 +114,7 @@ CpuNonbondedForce::~CpuNonbondedForce(){
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseCutoff(float distance, const vector<pair<int, int> >& neighbors, float solventDielectric) { void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
cutoff = true; cutoff = true;
cutoffDistance = distance; cutoffDistance = distance;
...@@ -199,23 +200,22 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) { ...@@ -199,23 +200,22 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void CpuNonbondedForce::tabulateEwaldScaleFactor() { void CpuNonbondedForce::tabulateEwaldScaleFactor() {
ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2); ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
ewaldDXInv = 1.0f/ewaldDX; ewaldDXInv = 1.0f/ewaldDX;
vector<double> x(NUM_TABLE_POINTS); vector<double> x(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS); vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv; vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS; i++) { for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double r = i*cutoffDistance/(NUM_TABLE_POINTS-2); double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
double alphaR = alphaEwald*r; double alphaR = alphaEwald*r;
x[i] = r; x[i] = r;
y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR); y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
} }
SplineFitter::createNaturalSpline(x, y, deriv); SplineFitter::createNaturalSpline(x, y, deriv);
ewaldScaleX.resize(NUM_TABLE_POINTS); ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
ewaldScaleY.resize(NUM_TABLE_POINTS);
ewaldScaleDeriv.resize(NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) { for (int i = 0; i < NUM_TABLE_POINTS; i++) {
ewaldScaleX[i] = (float) x[i]; ewaldScaleTable[4*i] = (float) y[i];
ewaldScaleY[i] = (float) y[i]; ewaldScaleTable[4*i+1] = (float) y[i+1];
ewaldScaleDeriv[i] = (float) (deriv[i]*ewaldDX*ewaldDX/6); ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
} }
} }
...@@ -358,42 +358,12 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const ...@@ -358,42 +358,12 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
for (int i = 0; i < numThreads; i++) for (int i = 0; i < numThreads; i++)
directEnergy += threadData[i]->threadEnergy; directEnergy += threadData[i]->threadEnergy;
for (int i = 0; i < numberOfAtoms; i++) { for (int i = 0; i < numberOfAtoms; i++) {
__m128 f = _mm_loadu_ps(forces+4*i); fvec4 f(forces+4*i);
for (int j = 0; j < numThreads; j++) for (int j = 0; j < numThreads; j++)
f = _mm_add_ps(f, _mm_loadu_ps(&threadData[j]->threadForce[4*i])); f += fvec4(&threadData[j]->threadForce[4*i]);
_mm_storeu_ps(forces+4*i, f); f.store(forces+4*i);
} }
if (ewald || pme) {
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
__m128 boxSize = _mm_set_ps(0, periodicBoxSize[2], periodicBoxSize[1], periodicBoxSize[0]);
__m128 invBoxSize = _mm_set_ps(0, (1/periodicBoxSize[2]), (1/periodicBoxSize[1]), (1/periodicBoxSize[0]));
for (int i = 0; i < numberOfAtoms; i++)
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int ii = i;
int jj = *iter;
__m128 deltaR;
__m128 posI = _mm_loadu_ps(posq+4*ii);
__m128 posJ = _mm_loadu_ps(posq+4*jj);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR);
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR));
_mm_storeu_ps(forces+4*ii, _mm_sub_ps(_mm_loadu_ps(forces+4*ii), result));
_mm_storeu_ps(forces+4*jj, _mm_add_ps(_mm_loadu_ps(forces+4*jj), result));
if (includeEnergy)
directEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
}
}
}
if (totalEnergy != NULL) if (totalEnergy != NULL)
*totalEnergy += (float) directEnergy; *totalEnergy += (float) directEnergy;
} }
...@@ -418,23 +388,47 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& ...@@ -418,23 +388,47 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
threadForce.resize(4*numberOfAtoms, 0.0f); threadForce.resize(4*numberOfAtoms, 0.0f);
for (int i = 0; i < 4*numberOfAtoms; i++) for (int i = 0; i < 4*numberOfAtoms; i++)
threadForce[i] = 0.0f; threadForce[i] = 0.0f;
__m128 boxSize = _mm_set_ps(0, periodicBoxSize[2], periodicBoxSize[1], periodicBoxSize[0]); fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
__m128 invBoxSize = _mm_set_ps(0, (1/periodicBoxSize[2]), (1/periodicBoxSize[1]), (1/periodicBoxSize[0])); fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
if (ewald || pme) { if (ewald || pme) {
// Compute the interactions from the neighbor list. // Compute the interactions from the neighbor list.
for (int i = index; i < (int) neighborList->size(); i += numThreads) { for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
pair<int, int> pair = (*neighborList)[i]; calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
calculateOneEwaldIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
} // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
for (int i = index; i < numberOfAtoms; i += numThreads)
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posI(posq+4*i);
fvec4 posJ(posq+4*j);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfcAlphaR = erfcApprox(alphaR)[0];
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
(fvec4(&threadForce[4*i])-result).store(&threadForce[4*i]);
(fvec4(&threadForce[4*j])+result).store(&threadForce[4*j]);
if (includeEnergy)
threadEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
}
}
} }
else if (cutoff) { else if (cutoff) {
// Compute the interactions from the neighbor list. // Compute the interactions from the neighbor list.
for (int i = index; i < (int) neighborList->size(); i += numThreads) { for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
pair<int, int> pair = (*neighborList)[i]; calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
calculateOneIxn(pair.first, pair.second, &threadForce[0], energyPtr, boxSize, invBoxSize);
}
} }
else { else {
// Loop over all atom pairs // Loop over all atom pairs
...@@ -448,12 +442,12 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& ...@@ -448,12 +442,12 @@ void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double&
} }
} }
void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize) { void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// get deltaR, R2, and R between 2 atoms // get deltaR, R2, and R between 2 atoms
__m128 deltaR; fvec4 deltaR;
__m128 posI = _mm_loadu_ps(posq+4*ii); fvec4 posI(posq+4*ii);
__m128 posJ = _mm_loadu_ps(posq+4*jj); fvec4 posJ(posq+4*jj);
float r2; float r2;
getDeltaR(posJ, posI, deltaR, r2, periodic, boxSize, invBoxSize); getDeltaR(posJ, posI, deltaR, r2, periodic, boxSize, invBoxSize);
if (cutoff && r2 >= cutoffDistance*cutoffDistance) if (cutoff && r2 >= cutoffDistance*cutoffDistance)
...@@ -497,83 +491,239 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t ...@@ -497,83 +491,239 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
// accumulate forces // accumulate forces
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR)); fvec4 result = deltaR*dEdR;
_mm_storeu_ps(forces+4*ii, _mm_add_ps(_mm_loadu_ps(forces+4*ii), result)); (fvec4(forces+4*ii)+result).store(forces+4*ii);
_mm_storeu_ps(forces+4*jj, _mm_sub_ps(_mm_loadu_ps(forces+4*jj), result)); (fvec4(forces+4*jj)-result).store(forces+4*jj);
} }
void CpuNonbondedForce::calculateOneEwaldIxn(int ii, int jj, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize) { void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
__m128 deltaR; // Load the positions and parameters of the atoms in the block.
__m128 posI = _mm_loadu_ps(posq+4*ii);
__m128 posJ = _mm_loadu_ps(posq+4*jj); int blockAtom[4];
float r2; fvec4 blockAtomPosq[4];
getDeltaR(posJ, posI, deltaR, r2, true, boxSize, invBoxSize); fvec4 blockAtomForce[4];
if (r2 < cutoffDistance*cutoffDistance) { for (int i = 0; i < 4; i++) {
float r = sqrtf(r2); blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
float inverseR = 1/r; blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
float switchValue = 1, switchDeriv = 0; blockAtomForce[i] = fvec4(0.0f);
if (useSwitch && r > switchingDistance) { }
float t = (r-switchingDistance)/(cutoffDistance-switchingDistance); fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
switchValue = 1+t*t*t*(-10+t*(15-t*6)); fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance); fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
any |= include[j];
}
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
} }
float chargeProd = ONE_4PI_EPS0*posq[4*ii+3]*posq[4*jj+3]; fvec4 sig = blockAtomSigma+atomParameters[atom].first;
float dEdR = chargeProd*inverseR*ewaldScaleFunction(r); fvec4 sig2 = inverseR*sig;
float sig = atomParameters[ii].first + atomParameters[jj].first; sig2 *= sig2;
float sig2 = inverseR*sig; fvec4 sig6 = sig2*sig2*sig2;
sig2 *= sig2; fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
float sig6 = sig2*sig2*sig2; fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
float eps = atomParameters[ii].second*atomParameters[jj].second; fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6; if (cutoff)
dEdR += chargeProd*(inverseR-2.0f*krf*r2);
else
dEdR += chargeProd*inverseR;
dEdR *= inverseR*inverseR; dEdR *= inverseR*inverseR;
float energy = eps*(sig6-1.0f)*sig6; fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) { if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR; dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue; energy *= switchValue;
} }
// accumulate forces // Accumulate energies.
__m128 result = _mm_mul_ps(deltaR, _mm_set1_ps(dEdR)); if (totalEnergy) {
_mm_storeu_ps(forces+4*ii, _mm_add_ps(_mm_loadu_ps(forces+4*ii), result)); if (cutoff)
_mm_storeu_ps(forces+4*jj, _mm_sub_ps(_mm_loadu_ps(forces+4*jj), result)); energy += chargeProd*(inverseR+krf*r2-crf);
else
energy += chargeProd*inverseR;
for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
}
// accumulate energies // Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
}
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
}
void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Load the positions and parameters of the atoms in the block.
int blockAtom[4];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForce[4];
for (int i = 0; i < 4; i++) {
blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
blockAtomForce[i] = fvec4(0.0f);
}
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
float blockAtomR2[4];
bool include[4];
fvec4 blockAtomDelta[4];
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
int atom = neighbors[i];
fvec4 atomPosq(posq+4*atom);
// Compute the distances to the block atoms.
bool any = false;
for (int j = 0; j < 4; j++) {
getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
any |= include[j];
}
if (!any)
continue; // No interactions to compute.
// Compute the interactions.
fvec4 r2(blockAtomR2);
fvec4 r = sqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 switchValue(1.0f), switchDeriv(0.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
}
fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
fvec4 sig = blockAtomSigma+atomParameters[atom].first;
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6;
if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue;
}
// Accumulate energies.
if (totalEnergy) { if (totalEnergy) {
energy += (float) (chargeProd*inverseR*erfcApprox(alphaEwald*r)); energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
*totalEnergy += energy; for (int j = 0; j < 4; j++)
if (include[j])
*totalEnergy += energy[j];
} }
// Accumulate forces.
fvec4 atomForce(forces+4*atom);
for (int j = 0; j < 4; j++) {
if (include[j]) {
fvec4 result = blockAtomDelta[j]*dEdR[j];
blockAtomForce[j] += result;
atomForce -= result;
}
}
atomForce.store(forces+4*atom);
} }
// Record the forces on the block atoms.
for (int j = 0; j < 4; j++)
(fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
} }
void CpuNonbondedForce::getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic, const __m128& boxSize, const __m128& invBoxSize) const { void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
deltaR = _mm_sub_ps(posJ, posI); deltaR = posJ-posI;
if (periodic) { if (periodic) {
__m128 base = _mm_mul_ps(_mm_floor_ps(_mm_add_ps(_mm_mul_ps(deltaR, invBoxSize), _mm_set1_ps(0.5))), boxSize); fvec4 base = round(deltaR*invBoxSize)*boxSize;
deltaR = _mm_sub_ps(deltaR, base); deltaR = deltaR-base;
} }
r2 = _mm_cvtss_f32(_mm_dp_ps(deltaR, deltaR, 0x71)); r2 = dot3(deltaR, deltaR);
} }
float CpuNonbondedForce::erfcApprox(float x) { fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7. // error of 3e-7.
float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x; fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
t *= t; t *= t;
t *= t; t *= t;
t *= t; t *= t;
return 1.0f/(t*t); return 1.0f/(t*t);
} }
float CpuNonbondedForce::ewaldScaleFunction(float x) { fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI) // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
int lower = (int) (x*ewaldDXInv); float y[4];
int upper = lower+1; fvec4 x1 = x*ewaldDXInv;
float a = (ewaldScaleX[upper]-x)*ewaldDXInv; ivec4 index = floor(x1);
float b = 1.0f-a; fvec4 coeff[4];
return a*ewaldScaleY[lower]+b*ewaldScaleY[upper]+((a*a*a-a)*ewaldScaleDeriv[lower] + (b*b*b-b)*ewaldScaleDeriv[upper]); coeff[1] = x1-index;
coeff[0] = 1.0f-coeff[1];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
_MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
for (int i = 0; i < 4; i++) {
if (index[i] < NUM_TABLE_POINTS)
y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
}
return fvec4(y);
} }
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <set> #include <set>
#include <utility>
#include <vector> #include <vector>
using namespace OpenMM; using namespace OpenMM;
...@@ -68,10 +69,19 @@ void testNeighborList(bool periodic) { ...@@ -68,10 +69,19 @@ void testNeighborList(bool periodic) {
// Convert the neighbor list to a set for faster lookup. // Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors; set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getNeighbors().size(); i++) { for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
pair<int, int> entry = neighborList.getNeighbors()[i]; int blockIndex = i/CpuNeighborList::BlockSize;
ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
neighbors.insert(entry); char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
int atom1 = neighborList.getSortedAtoms()[i];
int atom2 = neighborList.getBlockNeighbors(blockIndex)[j];
pair<int, int> entry = make_pair(min(atom1, atom2), max(atom1, atom2));
ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
neighbors.insert(entry);
}
}
} }
// Check each particle pair and figure out whether they should be in the neighbor list. // Check each particle pair and figure out whether they should be in the neighbor list.
...@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) { ...@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
if (dx*dx + dy*dy + dz*dz > cutoff*cutoff) if (dx*dx + dy*dy + dz*dz > cutoff*cutoff)
shouldInclude = false; shouldInclude = false;
bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end()); bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end());
ASSERT_EQUAL(shouldInclude, isIncluded); if (shouldInclude)
ASSERT(isIncluded);
} }
} }
......
...@@ -301,6 +301,14 @@ public: ...@@ -301,6 +301,14 @@ public:
*/ */
void setEwaldErrorTolerance(double tol); void setEwaldErrorTolerance(double tol);
/**
* Get the induced dipole moments of all particles.
*
* @param context the Context for which to get the induced dipoles
* @param dipoles the induced dipole moment of particle i is stored into the i'th element
*/
void getInducedDipoles(Context& context, std::vector<Vec3>& dipoles);
/** /**
* Get the electrostatic potential. * Get the electrostatic potential.
* *
......
...@@ -348,6 +348,8 @@ public: ...@@ -348,6 +348,8 @@ public:
*/ */
virtual double execute(ContextImpl& context, bool includeForces, bool includeEnergy) = 0; virtual double execute(ContextImpl& context, bool includeForces, bool includeEnergy) = 0;
virtual void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles) = 0;
virtual void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid, virtual void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
std::vector< double >& outputElectrostaticPotential ) = 0; std::vector< double >& outputElectrostaticPotential ) = 0;
......
...@@ -82,6 +82,8 @@ public: ...@@ -82,6 +82,8 @@ public:
*/ */
static void getCovalentDegree( const AmoebaMultipoleForce& force, std::vector<int>& covalentDegree ); static void getCovalentDegree( const AmoebaMultipoleForce& force, std::vector<int>& covalentDegree );
void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid, void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
std::vector< double >& outputElectrostaticPotential ); std::vector< double >& outputElectrostaticPotential );
......
...@@ -226,6 +226,10 @@ void AmoebaMultipoleForce::getCovalentMaps(int index, std::vector< std::vector<i ...@@ -226,6 +226,10 @@ void AmoebaMultipoleForce::getCovalentMaps(int index, std::vector< std::vector<i
} }
} }
void AmoebaMultipoleForce::getInducedDipoles(Context& context, vector<Vec3>& dipoles) {
dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context)).getInducedDipoles(getContextImpl(context), dipoles);
}
void AmoebaMultipoleForce::getElectrostaticPotential( const std::vector< Vec3 >& inputGrid, Context& context, std::vector< double >& outputElectrostaticPotential ){ void AmoebaMultipoleForce::getElectrostaticPotential( const std::vector< Vec3 >& inputGrid, Context& context, std::vector< double >& outputElectrostaticPotential ){
dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context)).getElectrostaticPotential(getContextImpl(context), inputGrid, outputElectrostaticPotential); dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context)).getElectrostaticPotential(getContextImpl(context), inputGrid, outputElectrostaticPotential);
} }
......
...@@ -183,6 +183,10 @@ void AmoebaMultipoleForceImpl::getCovalentDegree( const AmoebaMultipoleForce& fo ...@@ -183,6 +183,10 @@ void AmoebaMultipoleForceImpl::getCovalentDegree( const AmoebaMultipoleForce& fo
return; return;
} }
void AmoebaMultipoleForceImpl::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
kernel.getAs<CalcAmoebaMultipoleForceKernel>().getInducedDipoles(context, dipoles);
}
void AmoebaMultipoleForceImpl::getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid, void AmoebaMultipoleForceImpl::getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
std::vector< double >& outputElectrostaticPotential ){ std::vector< double >& outputElectrostaticPotential ){
kernel.getAs<CalcAmoebaMultipoleForceKernel>().getElectrostaticPotential(context, inputGrid, outputElectrostaticPotential); kernel.getAs<CalcAmoebaMultipoleForceKernel>().getElectrostaticPotential(context, inputGrid, outputElectrostaticPotential);
......
...@@ -1639,6 +1639,24 @@ void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& cont ...@@ -1639,6 +1639,24 @@ void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& cont
context.calcForcesAndEnergy(false, false, -1); context.calcForcesAndEnergy(false, false, -1);
} }
void CudaCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
ensureMultipolesValid(context);
int numParticles = cu.getNumAtoms();
dipoles.resize(numParticles);
if (cu.getUseDoublePrecision()) {
vector<double> d;
inducedDipole->download(d);
for (int i = 0; i < numParticles; i++)
dipoles[i] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
}
else {
vector<float> d;
inducedDipole->download(d);
for (int i = 0; i < numParticles; i++)
dipoles[i] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
}
}
void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const vector<Vec3>& inputGrid, vector<double>& outputElectrostaticPotential) { void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const vector<Vec3>& inputGrid, vector<double>& outputElectrostaticPotential) {
ensureMultipolesValid(context); ensureMultipolesValid(context);
int numPoints = inputGrid.size(); int numPoints = inputGrid.size();
......
...@@ -327,6 +327,13 @@ public: ...@@ -327,6 +327,13 @@ public:
* @return the potential energy due to the force * @return the potential energy due to the force
*/ */
double execute(ContextImpl& context, bool includeForces, bool includeEnergy); double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Get the induced dipole moments of all particles.
*
* @param context the Context for which to get the induced dipoles
* @param dipoles the induced dipole moment of particle i is stored into the i'th element
*/
void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
/** /**
* Execute the kernel to calculate the electrostatic potential * Execute the kernel to calculate the electrostatic potential
* *
......
...@@ -2700,6 +2700,40 @@ static void testPMEMutualPolarizationLargeWater( FILE* log ) { ...@@ -2700,6 +2700,40 @@ static void testPMEMutualPolarizationLargeWater( FILE* log ) {
} }
// test querying particle induced dipoles
static void testParticleInducedDipoles() {
int numberOfParticles = 8;
int inputPmeGridDimension = 0;
double cutoff = 9000000.0;
std::vector<Vec3> forces;
double energy;
System system;
AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
setupMultipoleAmmonia(system, amoebaMultipoleForce, AmoebaMultipoleForce::NoCutoff, AmoebaMultipoleForce::Mutual,
cutoff, inputPmeGridDimension);
LangevinIntegrator integrator(0.0, 0.1, 0.01);
Context context(system, integrator, Platform::getPlatformByName("CUDA"));
getForcesEnergyMultipoleAmmonia(context, forces, energy);
std::vector<Vec3> dipole;
amoebaMultipoleForce->getInducedDipoles(context, dipole);
// Compare to values calculated by TINKER.
std::vector<Vec3> expectedDipole(numberOfParticles);
expectedDipole[0] = Vec3(0.0031710288, 9.3687453e-7, -0.0006919963);
expectedDipole[1] = Vec3(8.0279737504e-5, -0.000279376, 4.778060103e-5);
expectedDipole[2] = Vec3(0.000079322, 0.0002789804, 4.8696656126e-5);
expectedDipole[3] = Vec3(-0.0001407394, 1.540638116e-6, -0.0007077775);
expectedDipole[4] = Vec3(0.0019564439, -1.0409717e-7, 0.0007332188);
expectedDipole[5] = Vec3(0.0008213891, -0.0007749618, -0.0003883865);
expectedDipole[6] = Vec3(0.0046133992, -7.2868019e-7, 0.0002500622);
expectedDipole[7] = Vec3(0.0008204731, 0.0007772727, -0.0003856176);
for (int i = 0; i < numberOfParticles; i++)
ASSERT_EQUAL_VEC(expectedDipole[i], dipole[i], 1e-4);
}
// test computation of system multipole moments // test computation of system multipole moments
static void testSystemMultipoleMoments( FILE* log ) { static void testSystemMultipoleMoments( FILE* log ) {
...@@ -2963,6 +2997,10 @@ int main(int argc, char* argv[]) { ...@@ -2963,6 +2997,10 @@ int main(int argc, char* argv[]) {
testMultipoleIonsAndWaterPMEMutualPolarization( log ); testMultipoleIonsAndWaterPMEMutualPolarization( log );
testMultipoleIonsAndWaterPMEDirectPolarization( log ); testMultipoleIonsAndWaterPMEDirectPolarization( log );
// test querying induced dipoles
testParticleInducedDipoles();
// test computation of system multipole moments // test computation of system multipole moments
testSystemMultipoleMoments( log ); testSystemMultipoleMoments( log );
......
...@@ -683,6 +683,25 @@ double ReferenceCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bo ...@@ -683,6 +683,25 @@ double ReferenceCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bo
return static_cast<double>(energy); return static_cast<double>(energy);
} }
void ReferenceCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& outputDipoles) {
int numParticles = context.getSystem().getNumParticles();
outputDipoles.resize(numParticles);
// Create an AmoebaReferenceMultipoleForce to do the calculation.
AmoebaReferenceMultipoleForce* amoebaReferenceMultipoleForce = setupAmoebaReferenceMultipoleForce( context );
vector<RealVec>& posData = extractPositions(context);
// Retrieve the induced dipoles.
vector<RealVec> inducedDipoles;
amoebaReferenceMultipoleForce->calculateInducedDipoles(posData, charges, dipoles, quadrupoles, tholes,
dampingFactors, polarity, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs, multipoleAtomCovalentInfo, inducedDipoles);
for (int i = 0; i < numParticles; i++)
outputDipoles[i] = inducedDipoles[i];
delete amoebaReferenceMultipoleForce;
}
void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid, void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
std::vector< double >& outputElectrostaticPotential ){ std::vector< double >& outputElectrostaticPotential ){
...@@ -704,8 +723,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextI ...@@ -704,8 +723,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextI
} }
delete amoebaReferenceMultipoleForce; delete amoebaReferenceMultipoleForce;
return;
} }
void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, std::vector< double >& outputMultipoleMoments){ void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, std::vector< double >& outputMultipoleMoments){
...@@ -726,8 +743,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextI ...@@ -726,8 +743,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextI
multipoleAtomCovalentInfo, outputMultipoleMoments ); multipoleAtomCovalentInfo, outputMultipoleMoments );
delete amoebaReferenceMultipoleForce; delete amoebaReferenceMultipoleForce;
return;
} }
void ReferenceCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaMultipoleForce& force) { void ReferenceCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaMultipoleForce& force) {
......
...@@ -366,6 +366,13 @@ public: ...@@ -366,6 +366,13 @@ public:
* @return the potential energy due to the force * @return the potential energy due to the force
*/ */
double execute(ContextImpl& context, bool includeForces, bool includeEnergy); double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Get the induced dipole moments of all particles.
*
* @param context the Context for which to get the induced dipoles
* @param dipoles the induced dipole moment of particle i is stored into the i'th element
*/
void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
/** /**
* Calculate the electrostatic potential given vector of grid coordinates. * Calculate the electrostatic potential given vector of grid coordinates.
* *
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment