Commit b21e3182 authored by Jason Swails's avatar Jason Swails
Browse files

Merge branch 'master' into psfinscode

parents 7b30da6e 3946c025
...@@ -296,7 +296,9 @@ SET(OPENMM_BUILD_SHARED_LIB ON CACHE BOOL "Whether to build shared OpenMM librar ...@@ -296,7 +296,9 @@ SET(OPENMM_BUILD_SHARED_LIB ON CACHE BOOL "Whether to build shared OpenMM librar
SET(EXTRA_LINK_FLAGS ${EXTRA_COMPILE_FLAGS}) SET(EXTRA_LINK_FLAGS ${EXTRA_COMPILE_FLAGS})
IF (CMAKE_SYSTEM_NAME MATCHES "Linux") IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt") IF (NOT ANDROID)
SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt")
ENDIF (NOT ANDROID)
ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux")
IF(OPENMM_BUILD_SHARED_LIB) IF(OPENMM_BUILD_SHARED_LIB)
...@@ -403,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN) ...@@ -403,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
MARK_AS_ADVANCED(CUDA_BUILD_EMULATION) MARK_AS_ADVANCED(CUDA_BUILD_EMULATION)
FIND_PACKAGE(OpenCL QUIET) FIND_PACKAGE(OpenCL QUIET)
IF(OPENCL_FOUND AND NOT APPLE) IF(OPENCL_FOUND)
SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library") SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library")
ELSE(OPENCL_FOUND AND NOT APPLE) ELSE(OPENCL_FOUND)
SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library") SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library")
ENDIF(OPENCL_FOUND AND NOT APPLE) ENDIF(OPENCL_FOUND)
IF(OPENMM_BUILD_OPENCL_LIB) IF(OPENMM_BUILD_OPENCL_LIB)
ADD_SUBDIRECTORY(platforms/opencl) ADD_SUBDIRECTORY(platforms/opencl)
ENDIF(OPENMM_BUILD_OPENCL_LIB) ENDIF(OPENMM_BUILD_OPENCL_LIB)
......
...@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) { ...@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
return fvec8(_mm256_sqrt_ps(v.val)); return fvec8(_mm256_sqrt_ps(v.val));
} }
static inline fvec8 rsqrt(const fvec8& v) {
// Initial estimate of rsqrt().
fvec8 y(_mm256_rsqrt_ps(v.val));
// Perform an iteration of Newton refinement.
fvec8 x2 = v*0.5f;
y *= fvec8(1.5f)-x2*y*y;
return y;
}
static inline float dot8(const fvec8& v1, const fvec8& v2) { static inline float dot8(const fvec8& v1, const fvec8& v2) {
fvec8 result = _mm256_dp_ps(v1, v2, 0xF1); fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec()); return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
......
...@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) { ...@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
return vabsq_f32(v); return vabsq_f32(v);
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 rsqrt(const fvec4& v) {
float32x4_t recipSqrt = vrsqrteq_f32(v); float32x4_t recipSqrt = vrsqrteq_f32(v);
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
return vmulq_f32(v, recipSqrt); return recipSqrt;
}
static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v;
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(const fvec4& v1, const fvec4& v2) {
......
...@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) { ...@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
return truncated + blend(0.0f, 1.0f, truncated<v); return truncated + blend(0.0f, 1.0f, truncated<v);
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 rsqrt(const fvec4& v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
ivec4 i = (__m128i) v; ivec4 i = (__m128i) v;
...@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
y *= 1.5f-x2*y*y; y *= 1.5f-x2*y*y;
y *= 1.5f-x2*y*y; y *= 1.5f-x2*y*y;
y *= 1.5f-x2*y*y; y *= 1.5f-x2*y*y;
return y*v; return y;
}
static inline fvec4 sqrt(const fvec4& v) {
return rsqrt(v)*v;
} }
#endif /*OPENMM_VECTORIZE_PNACL_H_*/ #endif /*OPENMM_VECTORIZE_PNACL_H_*/
......
...@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
return fvec4(_mm_sqrt_ps(v.val)); return fvec4(_mm_sqrt_ps(v.val));
} }
static inline fvec4 rsqrt(const fvec4& v) {
// Initial estimate of rsqrt().
fvec4 y(_mm_rsqrt_ps(v.val));
// Perform an iteration of Newton refinement.
fvec4 x2 = v*0.5f;
y *= fvec4(1.5f)-x2*y*y;
return y;
}
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013 Stanford University and the Authors. * * Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -61,6 +61,7 @@ public: ...@@ -61,6 +61,7 @@ public:
private: private:
int blockSize; int blockSize;
std::vector<int> sortedAtoms; std::vector<int> sortedAtoms;
std::vector<float> sortedPositions;
std::vector<std::vector<int> > blockNeighbors; std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions; std::vector<std::vector<char> > blockExclusions;
// The following variables are used to make information accessible to the individual threads. // The following variables are used to make information accessible to the individual threads.
......
...@@ -56,8 +56,8 @@ protected: ...@@ -56,8 +56,8 @@ protected:
/** /**
* Templatized implementation of calculateBlockIxn. * Templatized implementation of calculateBlockIxn.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize); void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -74,15 +74,15 @@ protected: ...@@ -74,15 +74,15 @@ protected:
/** /**
* Templatized implementation of calculateBlockEwaldIxn. * Templatized implementation of calculateBlockEwaldIxn.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize); void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/** /**
* Compute the displacement and squared distance between a collection of points, optionally using * Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const; void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
......
...@@ -55,8 +55,8 @@ protected: ...@@ -55,8 +55,8 @@ protected:
/** /**
* Templatized implementation of calculateBlockIxn. * Templatized implementation of calculateBlockIxn.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize); void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -73,15 +73,15 @@ protected: ...@@ -73,15 +73,15 @@ protected:
/** /**
* Templatized implementation of calculateBlockEwaldIxn. * Templatized implementation of calculateBlockEwaldIxn.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize); void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
/** /**
* Compute the displacement and squared distance between a collection of points, optionally using * Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const; void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
......
...@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) { ...@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
} }
void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) { void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
assert(cutoff); assert(useCutoff);
assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance); assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance); assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
assert(periodicBoxVectors[2][2] >= 2.0*cutoffDistance); assert(periodicBoxVectors[2][2] >= 2.0*cutoffDistance);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013 Stanford University and the Authors. * * Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -164,8 +164,8 @@ public: ...@@ -164,8 +164,8 @@ public:
return VoxelIndex(y, z); return VoxelIndex(y, z);
} }
void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const { void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
neighbors.resize(0); neighbors.resize(0);
exclusions.resize(0); exclusions.resize(0);
fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0); fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
...@@ -233,24 +233,34 @@ public: ...@@ -233,24 +233,34 @@ public:
float minx = centerPos[0]; float minx = centerPos[0];
float maxx = centerPos[0]; float maxx = centerPos[0];
fvec4 offset(-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz), 0); float offset[3] = {-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz)};
for (int k = 0; k < (int) blockAtoms.size(); k++) { for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
const float* atomPos = &atomLocations[4*blockAtoms[k]]; fvec4 dist2 = maxDistanceSquared;
fvec4 posVec(atomPos); if (y != atomVoxelIndex[k].y) {
fvec4 delta1 = offset-posVec; fvec4 dy1 = offset[1]-fvec4(&blockAtomY[k]);
fvec4 delta2 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0); fvec4 dy2 = dy1+voxelSizeY;
if (usePeriodic) { if (usePeriodic) {
delta1 -= round(delta1*invBoxSize)*boxSize; dy1 -= round(dy1*invBoxSize[1])*boxSize[1];
delta2 -= round(delta2*invBoxSize)*boxSize; dy2 -= round(dy2*invBoxSize[1])*boxSize[1];
}
fvec4 dy = min(abs(dy1), abs(dy2));
dist2 -= dy*dy;
}
if (z != atomVoxelIndex[k].z) {
fvec4 dz1 = offset[2]-fvec4(&blockAtomZ[k]);
fvec4 dz2 = dz1+voxelSizeZ;
if (usePeriodic) {
dz1 -= round(dz1*invBoxSize[2])*boxSize[2];
dz2 -= round(dz2*invBoxSize[2])*boxSize[2];
}
fvec4 dz = min(abs(dz1), abs(dz2));
dist2 -= dz*dz;
} }
fvec4 delta = min(abs(delta1), abs(delta2)); fvec4 dist = sqrt(dist2);
float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]); int numToCheck = min(4, (int) (blockAtoms.size()-k));
float dz = (z == atomVoxelIndex[k].z ? 0.0f : delta[2]); for (int m = 0; m < numToCheck; m++) {
float dist2 = maxDistanceSquared-dy*dy-dz*dz; minx = min(minx, blockAtomX[k+m]-dist[m]-xoffset);
if (dist2 > 0) { maxx = max(maxx, blockAtomX[k+m]+dist[m]-xoffset);
float dist = sqrtf(dist2);
minx = min(minx, atomPos[0]-dist-xoffset);
maxx = max(maxx, atomPos[0]+dist-xoffset);
} }
} }
if (minx == maxx) if (minx == maxx)
...@@ -290,12 +300,10 @@ public: ...@@ -290,12 +300,10 @@ public:
if (sortedIndex >= lastSortedIndex) if (sortedIndex >= lastSortedIndex)
continue; continue;
fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]); fvec4 atomPos(&sortedPositions[4*sortedIndex]);
fvec4 delta = atomPos-centerPos; fvec4 delta = atomPos-centerPos;
if (periodicRectangular) { if (periodicRectangular)
fvec4 base = round(delta*invBoxSize)*boxSize; delta -= round(delta*invBoxSize)*boxSize;
delta = delta-base;
}
else if (needPeriodic) { else if (needPeriodic) {
delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f); delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f);
delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f); delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f);
...@@ -310,26 +318,34 @@ public: ...@@ -310,26 +318,34 @@ public:
// The distance is large enough that there might not be any actual interactions. // The distance is large enough that there might not be any actual interactions.
// Check individual atom pairs to be sure. // Check individual atom pairs to be sure.
bool any = false; bool anyInteraction = false;
for (int k = 0; k < (int) blockAtoms.size(); k++) { for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
fvec4 pos1(&atomLocations[4*blockAtoms[k]]); fvec4 dx = fvec4(&blockAtomX[k])-atomPos[0];
delta = atomPos-pos1; fvec4 dy = fvec4(&blockAtomY[k])-atomPos[1];
fvec4 dz = fvec4(&blockAtomZ[k])-atomPos[2];
if (periodicRectangular) { if (periodicRectangular) {
fvec4 base = round(delta*invBoxSize)*boxSize; dx -= round(dx*invBoxSize[0])*boxSize[0];
delta = delta-base; dy -= round(dy*invBoxSize[1])*boxSize[1];
dz -= round(dz*invBoxSize[2])*boxSize[2];
} }
else if (needPeriodic) { else if (needPeriodic) {
delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f); fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f); dx -= scale3*periodicBoxVectors[2][0];
delta -= periodicBoxVec4[0]*floorf(delta[0]*recipBoxSize[0]+0.5f); dy -= scale3*periodicBoxVectors[2][1];
dz -= scale3*periodicBoxVectors[2][2];
fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
dx -= scale2*periodicBoxVectors[1][0];
dy -= scale2*periodicBoxVectors[1][1];
fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
dx -= scale1*periodicBoxVectors[0][0];
} }
float r2 = dot3(delta, delta); fvec4 r2 = dx*dx + dy*dy + dz*dz;
if (r2 < maxDistanceSquared) { if (any(r2 < maxDistanceSquared)) {
any = true; anyInteraction = true;
break; break;
} }
} }
if (!any) if (!anyInteraction)
continue; continue;
} }
...@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
blockNeighbors.resize(numBlocks); blockNeighbors.resize(numBlocks);
blockExclusions.resize(numBlocks); blockExclusions.resize(numBlocks);
sortedAtoms.resize(numAtoms); sortedAtoms.resize(numAtoms);
sortedPositions.resize(4*numAtoms);
// Record the parameters for the threads. // Record the parameters for the threads.
...@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float ...@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
int atomIndex = atomBins[i].second; int atomIndex = atomBins[i].second;
sortedAtoms[i] = atomIndex; sortedAtoms[i] = atomIndex;
fvec4 atomPos(&atomLocations[4*atomIndex]);
atomPos.store(&sortedPositions[4*i]);
voxels.insert(i, &atomLocations[4*atomIndex]); voxels.insert(i, &atomLocations[4*atomIndex]);
} }
voxels.sortItems(); voxels.sortItems();
...@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
int numBlocks = blockNeighbors.size(); int numBlocks = blockNeighbors.size();
vector<int> blockAtoms; vector<int> blockAtoms;
vector<float> blockAtomX(blockSize), blockAtomY(blockSize), blockAtomZ(blockSize);
vector<VoxelIndex> atomVoxelIndex; vector<VoxelIndex> atomVoxelIndex;
for (int i = threadIndex; i < numBlocks; i += numThreads) { for (int i = threadIndex; i < numBlocks; i += numThreads) {
// Find the atoms in this block and compute their bounding box. // Find the atoms in this block and compute their bounding box.
...@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI ...@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
blockAtoms[j] = sortedAtoms[firstIndex+j]; blockAtoms[j] = sortedAtoms[firstIndex+j];
atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]); atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
} }
fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]); fvec4 minPos(&sortedPositions[4*firstIndex]);
fvec4 maxPos = minPos; fvec4 maxPos = minPos;
for (int j = 1; j < atomsInBlock; j++) { for (int j = 1; j < atomsInBlock; j++) {
fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]); fvec4 pos(&sortedPositions[4*(firstIndex+j)]);
minPos = min(minPos, pos); minPos = min(minPos, pos);
maxPos = max(maxPos, pos); maxPos = max(maxPos, pos);
} }
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex); for (int j = 0; j < atomsInBlock; j++) {
blockAtomX[j] = sortedPositions[4*(firstIndex+j)];
blockAtomY[j] = sortedPositions[4*(firstIndex+j)+1];
blockAtomZ[j] = sortedPositions[4*(firstIndex+j)+2];
}
for (int j = atomsInBlock; j < blockSize; j++) {
blockAtomX[j] = 1e10;
blockAtomY[j] = 1e10;
blockAtomZ[j] = 1e10;
}
voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, blockAtomX, blockAtomY, blockAtomZ, sortedPositions, atomVoxelIndex);
// Record the exclusions for this block. // Record the exclusions for this block.
......
...@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() { ...@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() {
CpuNonbondedForceVec4::CpuNonbondedForceVec4() { CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
} }
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
if (triclinic) // Determine whether we need to apply periodic boundary conditions.
calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
else PeriodicType periodicType;
calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 4; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block. // Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
fvec4 blockAtomPosq[4]; fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f); fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]); fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]); fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first); fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second); fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) || const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2; fvec4 dx, dy, dz, r2;
getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include; ivec4 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions. // Compute the interactions.
fvec4 r = sqrt(r2); fvec4 inverseR = rsqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r;
fvec4 energy, dEdR; fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second; float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) { if (atomEpsilon != 0.0f) {
...@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR = epsSig6*(12.0f*sig6 - 6.0f); dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f); energy = epsSig6*(sig6-1.0f);
if (useSwitch) { if (useSwitch) {
fvec4 r = r2*inverseR;
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance); fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f)); fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval; fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
...@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
} }
void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
if (triclinic) // Determine whether we need to apply periodic boundary conditions.
calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
else PeriodicType periodicType;
calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 4; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block. // Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
fvec4 blockAtomPosq[4]; fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f); fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]); fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]); fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]); fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]); fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first); fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second); fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) || const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec4 dx, dy, dz, r2; fvec4 dx, dy, dz, r2;
getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec4 include; ivec4 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions. // Compute the interactions.
fvec4 r = sqrt(r2); fvec4 inverseR = rsqrt(r2);
fvec4 inverseR = fvec4(1.0f)/r; fvec4 r = r2*inverseR;
fvec4 energy, dEdR; fvec4 energy, dEdR;
float atomEpsilon = atomParameters[atom].second; float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) { if (atomEpsilon != 0.0f) {
...@@ -272,28 +368,26 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -272,28 +368,26 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0]; dx = x-posI[0];
dy = y-posI[1]; dy = y-posI[1];
dz = z-posI[2]; dz = z-posI[2];
if (periodic) { if (PERIODIC_TYPE == PeriodicTriclinic) {
if (TRICLINIC) { fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f); dx -= scale3*periodicBoxVectors[2][0];
dx -= scale3*periodicBoxVectors[2][0]; dy -= scale3*periodicBoxVectors[2][1];
dy -= scale3*periodicBoxVectors[2][1]; dz -= scale3*periodicBoxVectors[2][2];
dz -= scale3*periodicBoxVectors[2][2]; fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f); dx -= scale2*periodicBoxVectors[1][0];
dx -= scale2*periodicBoxVectors[1][0]; dy -= scale2*periodicBoxVectors[1][1];
dy -= scale2*periodicBoxVectors[1][1]; fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f); dx -= scale1*periodicBoxVectors[0][0];
dx -= scale1*periodicBoxVectors[0][0]; }
} else if (PERIODIC_TYPE == PeriodicPerInteraction) {
else { dx -= round(dx*invBoxSize[0])*boxSize[0];
dx -= round(dx*invBoxSize[0])*boxSize[0]; dy -= round(dy*invBoxSize[1])*boxSize[1];
dy -= round(dy*invBoxSize[1])*boxSize[1]; dz -= round(dz*invBoxSize[2])*boxSize[2];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
} }
r2 = dx*dx + dy*dy + dz*dz; r2 = dx*dx + dy*dy + dz*dz;
} }
......
...@@ -50,7 +50,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() { ...@@ -50,7 +50,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
*/ */
bool isVec8Supported() { bool isVec8Supported() {
// Make sure the CPU supports AVX. // Make sure the CPU supports AVX.
int cpuInfo[4]; int cpuInfo[4];
cpuid(cpuInfo, 0); cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) { if (cpuInfo[0] >= 1) {
...@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() { ...@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
CpuNonbondedForceVec8::CpuNonbondedForceVec8() { CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
} }
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
if (triclinic) // Determine whether we need to apply periodic boundary conditions.
calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
else PeriodicType periodicType;
calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 8; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block. // Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
fvec4 blockAtomPosq[8]; fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f); fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge; fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge); transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0; blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first); fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second); fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) || const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2; fvec8 dx, dy, dz, r2;
getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include; ivec8 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions. // Compute the interactions.
fvec8 r = sqrt(r2); fvec8 inverseR = rsqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r;
fvec8 energy, dEdR; fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second; float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) { if (atomEpsilon != 0.0f) {
...@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR = epsSig6*(12.0f*sig6 - 6.0f); dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f); energy = epsSig6*(sig6-1.0f);
if (useSwitch) { if (useSwitch) {
fvec8 r = r2*inverseR;
fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval); fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f)); fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval; fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
...@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, ...@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
} }
void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
if (triclinic) // Determine whether we need to apply periodic boundary conditions.
calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
else PeriodicType periodicType;
calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize); fvec4 blockCenter;
if (!periodic) {
periodicType = NoPeriodic;
blockCenter = 0.0f;
}
else {
const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
float minx, maxx, miny, maxy, minz, maxz;
minx = maxx = posq[4*blockAtom[0]];
miny = maxy = posq[4*blockAtom[0]+1];
minz = maxz = posq[4*blockAtom[0]+2];
for (int i = 1; i < 8; i++) {
minx = min(minx, posq[4*blockAtom[i]]);
maxx = max(maxx, posq[4*blockAtom[i]]);
miny = min(miny, posq[4*blockAtom[i]+1]);
maxy = max(maxy, posq[4*blockAtom[i]+1]);
minz = min(minz, posq[4*blockAtom[i]+2]);
maxz = max(maxz, posq[4*blockAtom[i]+2]);
}
blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
periodicType = NoPeriodic;
else if (triclinic)
periodicType = PeriodicTriclinic;
else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
periodicType = PeriodicPerAtom;
else
periodicType = PeriodicPerInteraction;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if (periodicType == NoPeriodic)
calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerAtom)
calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicPerInteraction)
calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
else if (periodicType == PeriodicTriclinic)
calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) { void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block. // Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex]; const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
fvec4 blockAtomPosq[8]; fvec4 blockAtomPosq[8];
fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f); fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge; fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++) {
blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]); blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
if (PERIODIC_TYPE == PeriodicPerAtom)
blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
}
transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge); transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
blockAtomCharge *= ONE_4PI_EPS0; blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first); fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second); fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) || const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance); const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block. // Loop over neighbors for this block.
...@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the distances to the block atoms. // Compute the distances to the block atoms.
fvec8 dx, dy, dz, r2; fvec8 dx, dy, dz, r2;
getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize); fvec4 atomPos(posq+4*atom);
if (PERIODIC_TYPE == PeriodicPerAtom)
atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
ivec8 include; ivec8 include;
char excl = exclusions[i]; char excl = exclusions[i];
if (excl == 0) if (excl == 0)
...@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions. // Compute the interactions.
fvec8 r = sqrt(r2); fvec8 inverseR = rsqrt(r2);
fvec8 inverseR = fvec8(1.0f)/r; fvec8 r = r2*inverseR;
fvec8 energy, dEdR; fvec8 energy, dEdR;
float atomEpsilon = atomParameters[atom].second; float atomEpsilon = atomParameters[atom].second;
if (atomEpsilon != 0.0f) { if (atomEpsilon != 0.0f) {
...@@ -268,7 +364,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -268,7 +364,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
} }
fvec8 chargeProd = blockAtomCharge*posq[4*atom+3]; fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
dEdR += chargeProd*inverseR*ewaldScaleFunction(r); dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
dEdR *= inverseR*inverseR; dEdR *= inverseR*inverseR;
// Accumulate energies. // Accumulate energies.
...@@ -302,28 +398,26 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo ...@@ -302,28 +398,26 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
(fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]); (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
} }
template <bool TRICLINIC> template <int PERIODIC_TYPE>
void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const { void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
dx = x-posI[0]; dx = x-posI[0];
dy = y-posI[1]; dy = y-posI[1];
dz = z-posI[2]; dz = z-posI[2];
if (periodic) { if (PERIODIC_TYPE == PeriodicTriclinic) {
if (TRICLINIC) { fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f);
fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f); dx -= scale3*periodicBoxVectors[2][0];
dx -= scale3*periodicBoxVectors[2][0]; dy -= scale3*periodicBoxVectors[2][1];
dy -= scale3*periodicBoxVectors[2][1]; dz -= scale3*periodicBoxVectors[2][2];
dz -= scale3*periodicBoxVectors[2][2]; fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f);
fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f); dx -= scale2*periodicBoxVectors[1][0];
dx -= scale2*periodicBoxVectors[1][0]; dy -= scale2*periodicBoxVectors[1][1];
dy -= scale2*periodicBoxVectors[1][1]; fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f);
fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f); dx -= scale1*periodicBoxVectors[0][0];
dx -= scale1*periodicBoxVectors[0][0]; }
} else if (PERIODIC_TYPE == PeriodicPerInteraction) {
else { dx -= round(dx*invBoxSize[0])*boxSize[0];
dx -= round(dx*invBoxSize[0])*boxSize[0]; dy -= round(dy*invBoxSize[1])*boxSize[1];
dy -= round(dy*invBoxSize[1])*boxSize[1]; dz -= round(dz*invBoxSize[2])*boxSize[2];
dz -= round(dz*invBoxSize[2])*boxSize[2];
}
} }
r2 = dx*dx + dy*dy + dz*dz; r2 = dx*dx + dy*dy + dz*dz;
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -40,7 +40,7 @@ namespace OpenMM { ...@@ -40,7 +40,7 @@ namespace OpenMM {
* 15, 207–228 (2000). * 15, 207–228 (2000).
* <p> * <p>
* This class places certain restrictions on the allowed dimensions of the grid. First, * This class places certain restrictions on the allowed dimensions of the grid. First,
* the size of each dimension may have no prime factors other than 2, 3, and 5. You * the size of each dimension may have no prime factors other than 2, 3, 5, and 7. You
* can call findLegalDimension() to determine the smallest size that satisfies this * can call findLegalDimension() to determine the smallest size that satisfies this
* requirement and is greater than or equal to a specified minimum size. Second, the size * requirement and is greater than or equal to a specified minimum size. Second, the size
* of each dimension must be small enough to compute each 1D transform entirely in local * of each dimension must be small enough to compute each 1D transform entirely in local
...@@ -61,12 +61,17 @@ public: ...@@ -61,12 +61,17 @@ public:
* @param xsize the first dimension of the data sets on which FFTs will be performed * @param xsize the first dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed * @param ysize the second dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed * @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/ */
OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize); OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
/** /**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output * Perform a Fourier transform. The transform cannot be done in-place: the input and output
* arrays must be different. Also, the input array is used as workspace, so its contents * arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed. * are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform.
* <p>
* When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
* and contains only the non-redundant elements.
* *
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z) * @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data * @param out on exit, this contains the transformed data
...@@ -75,17 +80,20 @@ public: ...@@ -75,17 +80,20 @@ public:
void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true); void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true);
/** /**
* Get the smallest legal size for a dimension of the grid (that is, a size with no prime * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
* factors other than 2, 3, and 5). * factors other than 2, 3, 5, and 7).
* *
* @param minimum the minimum size the return value must be greater than or equal to * @param minimum the minimum size the return value must be greater than or equal to
*/ */
static int findLegalDimension(int minimum); static int findLegalDimension(int minimum);
private: private:
cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads); cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal);
int xsize, ysize, zsize; int xsize, ysize, zsize;
int xthreads, ythreads, zthreads; int xthreads, ythreads, zthreads;
bool packRealAsComplex;
OpenCLContext& context; OpenCLContext& context;
cl::Kernel xkernel, ykernel, zkernel; cl::Kernel xkernel, ykernel, zkernel;
cl::Kernel invxkernel, invykernel, invzkernel;
cl::Kernel packForwardKernel, unpackForwardKernel, packBackwardKernel, unpackBackwardKernel;
}; };
} // namespace OpenMM } // namespace OpenMM
......
...@@ -639,6 +639,7 @@ private: ...@@ -639,6 +639,7 @@ private:
cl::Kernel pmeSpreadChargeKernel; cl::Kernel pmeSpreadChargeKernel;
cl::Kernel pmeFinishSpreadChargeKernel; cl::Kernel pmeFinishSpreadChargeKernel;
cl::Kernel pmeConvolutionKernel; cl::Kernel pmeConvolutionKernel;
cl::Kernel pmeEvalEnergyKernel;
cl::Kernel pmeInterpolateForceKernel; cl::Kernel pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms; std::vector<std::pair<int, int> > exceptionAtoms;
......
...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one // if they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size()) if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
continue; continue;
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")) continue; // The CPU device on OS X won't work correctly.
continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = 8; int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) { if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = ""; defaultOptimizationOptions = "";
else if (platformVendor == "Apple")
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
else else
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
} }
else else
simdWidth = 1; simdWidth = 1;
if (platformVendor == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = ""; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision) if (supportsDoublePrecision)
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -35,25 +35,109 @@ ...@@ -35,25 +35,109 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) { OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
zkernel = createKernel(xsize, ysize, zsize, zthreads); context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
xkernel = createKernel(ysize, zsize, xsize, xthreads); packRealAsComplex = false;
ykernel = createKernel(zsize, xsize, ysize, ythreads); int packedXSize = xsize;
int packedYSize = ysize;
int packedZSize = zsize;
if (realToComplex) {
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex = true;
int packedAxis, bufferSize;
if (xsize%2 == 0) {
packedAxis = 0;
packedXSize /= 2;
bufferSize = packedXSize;
}
else if (ysize%2 == 0) {
packedAxis = 1;
packedYSize /= 2;
bufferSize = packedYSize;
}
else if (zsize%2 == 0) {
packedAxis = 2;
packedZSize /= 2;
bufferSize = packedZSize;
}
else
packRealAsComplex = false;
if (packRealAsComplex) {
// Build the kernels for packing and unpacking the data.
map<string, string> defines;
defines["XSIZE"] = context.intToString(xsize);
defines["YSIZE"] = context.intToString(ysize);
defines["ZSIZE"] = context.intToString(zsize);
defines["PACKED_AXIS"] = context.intToString(packedAxis);
defines["PACKED_XSIZE"] = context.intToString(packedXSize);
defines["PACKED_YSIZE"] = context.intToString(packedYSize);
defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
defines["M_PI"] = context.doubleToString(M_PI);
cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
packForwardKernel = cl::Kernel(program, "packForwardData");
unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
packBackwardKernel = cl::Kernel(program, "packBackwardData");
packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
}
}
bool inputIsReal = (realToComplex && !packRealAsComplex);
zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
} }
void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) { void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer()); cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
zkernel.setArg<cl_int>(2, forward ? 1 : -1); cl::Kernel kernel3 = (forward ? ykernel : invykernel);
context.executeKernel(zkernel, xsize*ysize*zsize, zthreads); if (packRealAsComplex) {
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer()); cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer()); cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
xkernel.setArg<cl_int>(2, forward ? 1 : -1); int gridSize = xsize*ysize*zsize/2;
context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer()); // Pack the data into a half sized grid.
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_int>(2, forward ? 1 : -1); packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
context.executeKernel(ykernel, xsize*ysize*zsize, ythreads); packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(packKernel, gridSize);
// Perform the FFT.
kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel1, gridSize, zthreads);
kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel2, gridSize, xthreads);
kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel3, gridSize, ythreads);
// Unpack the data.
unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(unpackKernel, gridSize);
}
else {
kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
}
} }
int OpenCLFFT3D::findLegalDimension(int minimum) { int OpenCLFFT3D::findLegalDimension(int minimum) {
...@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) { ...@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
} }
} }
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) { cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
while (maxThreads > 128 && maxThreads-64 >= zsize)
maxThreads -= 64;
bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU; bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
while (true) { while (true) {
bool loopRequired = (zsize > maxThreads || isCPU); bool loopRequired = (zsize > maxThreads || isCPU);
...@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n"; source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n"; source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n"; source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n"; source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n"; source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n"; source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n"; source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 t0 = b0+b1;\n"; source<<"real2 t0 = b0+b1;\n";
source<<"real2 t1 = b2+b3;\n"; source<<"real2 t1 = b2+b3;\n";
source<<"real2 t2 = b4-b3;\n"; source<<"real2 t2 = b4-b3;\n";
...@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d7 = d6+d5;\n"; source<<"real2 d7 = d6+d5;\n";
source<<"real2 d8 = d6-d5;\n"; source<<"real2 d8 = d6-d5;\n";
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n"; source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n"; source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n"; source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n"; source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n"; source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
...@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 d0 = c0+c2;\n"; source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n"; source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n"; source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n"; source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n"; source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n"; source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n"; source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
...@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n"; source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n"; source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n"; source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n"; source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n"; source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n"; source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n"; source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
...@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel. // Create the kernel.
bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : "");
if (loopRequired) { if (loopRequired) {
if (outputIsPacked)
source<<"if (x < XSIZE/2+1)\n";
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n"; source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n"; if (outputIsPacked)
source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
else
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
} }
else { else {
source<<"if (index < XSIZE*YSIZE)\n"; if (outputIsPacked) {
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n"; source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
else {
source<<"if (index < XSIZE*YSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
}
} }
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
...@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements["M_PI"] = context.doubleToString(M_PI); replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str(); replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0"); replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
replacements["SIGN"] = (forward ? "1" : "-1");
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT"); cl::Kernel kernel(program, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*zsize); threads = (isCPU ? 1 : blocksPerGroup*zsize);
...@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue; continue;
} }
int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)); int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(2, bufferSize, NULL);
kernel.setArg(3, bufferSize, NULL); kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL); kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
return kernel; return kernel;
} }
} }
...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY); pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ); pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0)); pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cl.doubleToString(M_PI);
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1"; pmeDefines["DEVICE_IS_CPU"] = "1";
...@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid"); pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
cl.addAutoclearBuffer(*pmeGrid);
pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2"); pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(*pmeGrid2);
else
cl.addAutoclearBuffer(*pmeGrid);
pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms()); sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA"); bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = isNvidia;
if (usePmeQueue) { if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice()); pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
...@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel = cl::Kernel(program, "recordZIndex"); pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge"); pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution"); pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce"); pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
...@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer()); if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
} }
} }
} }
...@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid != NULL && includeReciprocal) { if (pmeGrid != NULL && includeReciprocal) {
if (usePmeQueue) if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue); cl.setQueue(pmeQueue);
// Invert the periodic box vectors. // Invert the periodic box vectors.
...@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
} }
fft->execFFT(*pmeGrid, *pmeGrid2, true); fft->execFFT(*pmeGrid, *pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]); pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]); pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]); pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor); pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
} }
else { else {
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]); pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]); pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]); pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor); pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
} pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms()); cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
fft->execFFT(*pmeGrid2, *pmeGrid, false); fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3); setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
......
...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const { ...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool OpenCLPlatform::isPlatformSupported() { bool OpenCLPlatform::isPlatformSupported() {
// Return false for OpenCL implementations that are known // Return false for OpenCL implementations that are known
// to be buggy (Apple OSX since 10.7.5) // to be buggy (Apple OS X prior to 10.10).
#ifdef __APPLE__ #ifdef __APPLE__
char str[256]; char str[256];
...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() { ...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3) if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
return false; return false;
if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2)) if (major < 14 || (major == 14 && minor < 3))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// point at which a number of serious bugs were introduced into the // contained a number of serious bugs in the Apple OpenCL libraries.
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations // (See https://github.com/SimTk/openmm/issues/395 for example.)
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
return false; return false;
#endif #endif
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. * * Portions copyright (c) 2010-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MIN_KEY"] = trait->getMinKey(); replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList"); shortListKernel = cl::Kernel(program, "sortShortList");
computeRangeKernel = cl::Kernel(program, "computeRange"); computeRangeKernel = cl::Kernel(program, "computeRange");
...@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()); unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize); // On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize); positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
......
...@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea ...@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int start = block2*TILE_SIZE; int start = block2*TILE_SIZE;
int included[TILE_SIZE]; int included[TILE_SIZE];
int numIncluded = 0; int numIncluded = 0;
SYNC_WARPS;
positionCache[get_local_id(0)] = posq[start+indexInWarp]; positionCache[get_local_id(0)] = posq[start+indexInWarp];
SYNC_WARPS;
if (atom1 < NUM_ATOMS) { if (atom1 < NUM_ATOMS) {
for (int j = 0; j < 32; j++) { for (int j = 0; j < 32; j++) {
int atom2 = start+j; int atom2 = start+j;
...@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor ...@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor
unsigned int globalIndex = startAtom+get_local_id(0); unsigned int globalIndex = startAtom+get_local_id(0);
posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0); posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Perform a parallel prefix sum. // Perform a parallel prefix sum.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment