Commit b1a22214 authored by Peter Eastman's avatar Peter Eastman
Browse files

Implemented PME for OpenCL platform

parent 3a38084c
......@@ -143,6 +143,9 @@ void NonbondedForceImpl::calcPMEParameters(const System& system, const Nonbonded
xsize = (int) ceil(alpha*boxVectors[0][0]/pow(0.5*tol, 0.2));
ysize = (int) ceil(alpha*boxVectors[1][1]/pow(0.5*tol, 0.2));
zsize = (int) ceil(alpha*boxVectors[2][2]/pow(0.5*tol, 0.2));
xsize = max(xsize, 5);
ysize = max(ysize, 5);
zsize = max(zsize, 5);
}
int NonbondedForceImpl::findZero(const NonbondedForceImpl::ErrorFunction& f, int initialGuess) {
......
......@@ -86,6 +86,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
if (unfactored%5 == 0) {
L = L/5;
source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -109,12 +110,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*xsize)<<"/"<<(5*L)<<"], d8+d10);\n";
source<<"data"<<output<<"[i+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*xsize)<<"/"<<(5*L)<<"], d8-d10);\n";
source<<"data"<<output<<"[i+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*xsize)<<"/"<<(5*L)<<"], d7-d9);\n";
source<<"}\n";
m = m*5;
unfactored /= 5;
}
else if (unfactored%4 == 0) {
L = L/4;
source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -128,12 +131,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<xsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*xsize)<<"/"<<(4*L)<<"], d0-d2);\n";
source<<"data"<<output<<"[i+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*xsize)<<"/"<<(4*L)<<"], d1-d3);\n";
source<<"}\n";
m = m*4;
unfactored /= 4;
}
else if (unfactored%3 == 0) {
L = L/3;
source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -144,17 +149,20 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<xsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*xsize)<<"/"<<(3*L)<<"], d1-d2);\n";
source<<"}\n";
m = m*3;
unfactored /= 3;
}
else if (unfactored%2 == 0) {
L = L/2;
source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<xsize<<"/"<<(2*L)<<"], c0-c1);\n";
source<<"}\n";
m = m*2;
unfactored /= 2;
}
......@@ -167,7 +175,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
// Create the kernel.
source<<"matrix[element] = data"<<(stage%2)<<"[i];";
source<<"matrix[element] = data"<<(stage%2)<<"[i];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements;
replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize);
replacements["YSIZE"] = OpenCLExpressionUtilities::intToString(ysize);
......
......@@ -691,6 +691,28 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
delete exceptionParams;
if (exceptionIndices != NULL)
delete exceptionIndices;
if (cosSinSums != NULL)
delete cosSinSums;
if (pmeGrid != NULL)
delete pmeGrid;
if (pmeBsplineModuliX != NULL)
delete pmeBsplineModuliX;
if (pmeBsplineModuliY != NULL)
delete pmeBsplineModuliY;
if (pmeBsplineModuliZ != NULL)
delete pmeBsplineModuliZ;
if (pmeBsplineTheta != NULL)
delete pmeBsplineTheta;
if (pmeBsplineDtheta != NULL)
delete pmeBsplineDtheta;
if (pmeAtomRange != NULL)
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex;
if (sort != NULL)
delete sort;
if (fft != NULL)
delete fft;
}
void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
......@@ -780,8 +802,101 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
cosSinSums = new OpenCLArray<mm_float2>(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), "cosSinSums");
}
else if (force.getNonbondedMethod() == NonbondedForce::PME)
throw OpenMMException("OpenMMPlatform does not yet support PME");
else if (force.getNonbondedMethod() == NonbondedForce::PME) {
// Compute the PME parameters.
double alpha;
int gridSizeX, gridSizeY, gridSizeZ;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
gridSizeX = OpenCLFFT3D::findLegalDimension(gridSizeX);
gridSizeY = OpenCLFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = OpenCLFFT3D::findLegalDimension(gridSizeZ);
defines["EWALD_ALPHA"] = doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
double selfEnergyScale = ONE_4PI_EPS0*alpha/std::sqrt(M_PI);
ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/std::sqrt(M_PI);
pmeDefines["PME_ORDER"] = intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = intToString(numParticles);
pmeDefines["RECIP_EXP_FACTOR"] = doubleToString(M_PI*M_PI/(alpha*alpha));
pmeDefines["GRID_SIZE_X"] = intToString(gridSizeX);
pmeDefines["GRID_SIZE_Y"] = intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = doubleToString(std::sqrt(ONE_4PI_EPS0));
// Create required data structures.
pmeGrid = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid");
pmeBsplineModuliX = new OpenCLArray<cl_float>(cl, gridSizeX, "pmeBsplineModuliX");
pmeBsplineModuliY = new OpenCLArray<cl_float>(cl, gridSizeY, "pmeBsplineModuliY");
pmeBsplineModuliZ = new OpenCLArray<cl_float>(cl, gridSizeZ, "pmeBsplineModuliZ");
pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
pmeBsplineDtheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDtheta");
pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = new OpenCLArray<mm_float2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort<mm_float2>(cl, cl.getNumAtoms(), "float2", "value.y");
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
// Initialize the b-spline moduli.
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1);
data[PmeOrder-1] = 0.0;
for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
for(int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
vector<cl_float> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
}
moduli[i] = (float) (sc*sc+ss*ss);
}
for (int i = 0; i < ndata; i++)
{
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
}
if (dim == 0)
pmeBsplineModuliX->upload(moduli);
else if (dim == 1)
pmeBsplineModuliY->upload(moduli);
else
pmeBsplineModuliZ->upload(moduli);
}
}
else
ewaldSelfEnergy = 0.0;
......@@ -848,6 +963,43 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums->getDeviceBuffer());
}
if (pmeGrid != NULL) {
mm_float4 boxSize = cl.getNonbondedUtilities().getPeriodicBoxSize();
pmeDefines["PERIODIC_BOX_SIZE_X"] = doubleToString(boxSize.x);
pmeDefines["PERIODIC_BOX_SIZE_Y"] = doubleToString(boxSize.y);
pmeDefines["PERIODIC_BOX_SIZE_Z"] = doubleToString(boxSize.z);
pmeDefines["RECIP_SCALE_FACTOR"] = doubleToString(1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z));
cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
pmeGridIndexKernel = cl::Kernel(program, "updateGridIndexAndFraction");
pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
pmeUpdateBsplinesKernel = cl::Kernel(program, "updateBsplines");
pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
pmeGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeBsplineTheta->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(3, pmeBsplineDtheta->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(4, pmeGrid->getDeviceBuffer());
}
}
if (exceptionIndices != NULL)
cl.executeKernel(exceptionsKernel, exceptionIndices->getSize());
......@@ -855,6 +1007,17 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
cl.executeKernel(ewaldSumsKernel, cosSinSums->getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
if (pmeGrid != NULL) {
cl.executeKernel(pmeGridIndexKernel, cl.getNumAtoms());
sort->sort(*pmeAtomGridIndex);
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
fft->execFFT(*pmeGrid, true);
cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
fft->execFFT(*pmeGrid, false);
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
}
}
double OpenCLCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
......
......@@ -30,7 +30,9 @@
#include "OpenCLPlatform.h"
#include "OpenCLArray.h"
#include "OpenCLContext.h"
#include "OpenCLFFT3D.h"
#include "OpenCLParameterSet.h"
#include "OpenCLSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
......@@ -350,7 +352,9 @@ private:
class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL) {
hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL), pmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDtheta(NULL), pmeAtomRange(NULL),
pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
}
~OpenCLCalcNonbondedForceKernel();
/**
......@@ -380,10 +384,28 @@ private:
OpenCLArray<mm_float4>* exceptionParams;
OpenCLArray<mm_int4>* exceptionIndices;
OpenCLArray<mm_float2>* cosSinSums;
OpenCLArray<mm_float2>* pmeGrid;
OpenCLArray<cl_float>* pmeBsplineModuliX;
OpenCLArray<cl_float>* pmeBsplineModuliY;
OpenCLArray<cl_float>* pmeBsplineModuliZ;
OpenCLArray<mm_float4>* pmeBsplineTheta;
OpenCLArray<mm_float4>* pmeBsplineDtheta;
OpenCLArray<cl_int>* pmeAtomRange;
OpenCLArray<mm_float2>* pmeAtomGridIndex;
OpenCLSort<mm_float2>* sort;
OpenCLFFT3D* fft;
cl::Kernel exceptionsKernel;
cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel;
cl::Kernel pmeGridIndexKernel;
cl::Kernel pmeAtomRangeKernel;
cl::Kernel pmeUpdateBsplinesKernel;
cl::Kernel pmeSpreadChargeKernel;
cl::Kernel pmeConvolutionKernel;
cl::Kernel pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines;
double cutoffSquared, ewaldSelfEnergy;
static const int PmeOrder = 5;
};
/**
......
......@@ -92,6 +92,8 @@ public:
sortKernelSize = maxLocalBuffer;
int targetBucketSize = sortKernelSize/2;
int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
......
......@@ -9,14 +9,12 @@ float2 multiplyComplex(float2 c1, float2 c2) {
__kernel void execFFT(__global float2* matrix, float sign, __local float2* w, __local float2* data0, __local float2* data1) {
const int i = get_local_id(0);
w[i] = (float2) (cos(-sign*i*2*M_PI/XSIZE), sin(-sign*i*2*M_PI/XSIZE));
int index = get_group_id(0);
while (index < YSIZE*ZSIZE) {
for (int index = get_group_id(0); index < YSIZE*ZSIZE; index += get_num_groups(0)) {
int z = index/YSIZE;
int y = index-z*YSIZE;
int element = i*XMULT+y*YMULT+z*ZMULT;
data0[i] = matrix[element];
barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT
index += get_num_groups(0);
}
}
__kernel void updateGridIndexAndFraction(__global float4* posq, __global float2* pmeAtomGridIndex) {
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
float4 pos = posq[i];
float4 t = (float4) ((pos.x/PERIODIC_BOX_SIZE_X+1.0f)*GRID_SIZE_X,
(pos.y/PERIODIC_BOX_SIZE_Y+1.0f)*GRID_SIZE_Y,
(pos.z/PERIODIC_BOX_SIZE_Z+1.0f)*GRID_SIZE_Z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
pmeAtomGridIndex[i] = (float2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
__kernel void findAtomRangeForGrid(__global float4* posq, __global float2* pmeAtomGridIndex, __global int* pmeAtomRange) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
float2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
// The grid index won't be needed again. Reuse that component to hold the atom charge, thus saving
// an extra load operation in the charge spreading kernel.
pmeAtomGridIndex[i].y = posq[(int) atomData.x].w;
}
// Fill in values beyond the last atom.
if (get_global_id(0) == get_global_size(0)-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) {
data[j] = 0.0f;
ddata[j] = 0.0f;
}
float4 pos = posq[i];
float4 t = (float4) ((pos.x/PERIODIC_BOX_SIZE_X+1.0f)*GRID_SIZE_X,
(pos.y/PERIODIC_BOX_SIZE_Y+1.0f)*GRID_SIZE_Y,
(pos.z/PERIODIC_BOX_SIZE_Z+1.0f)*GRID_SIZE_Z, 0.0f);
float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
float div = 1.0f/(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
}
}
}
__kernel void gridSpreadCharge(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
int3 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
gridPoint.x += GRID_SIZE_X;
gridPoint.y += GRID_SIZE_Y;
gridPoint.z += GRID_SIZE_Z;
float result = 0.0f;
for (int ix = 0; ix < PME_ORDER; ++ix)
for (int iy = 0; iy < PME_ORDER; ++iy)
for (int iz = 0; iz < PME_ORDER; ++iz) {
int x = (gridPoint.x-ix)%GRID_SIZE_X;
int y = (gridPoint.y-iy)%GRID_SIZE_Y;
int z = (gridPoint.z-iz)%GRID_SIZE_Z;
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
int firstAtom = pmeAtomRange[gridIndex];
int lastAtom = pmeAtomRange[gridIndex+1];
for (int i = firstAtom; i < lastAtom; ++i) {
float2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
float atomCharge = atomData.y;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
}
pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
}
}
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
int ky = remainder/GRID_SIZE_Z;
int kz = remainder-ky*GRID_SIZE_Z;
if (kx == 0 && ky == 0 && kz == 0)
continue;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
float mhx = mx/PERIODIC_BOX_SIZE_X;
float mhy = my/PERIODIC_BOX_SIZE_Y;
float mhz = mz/PERIODIC_BOX_SIZE_Z;
float bx = pmeBsplineModuliX[kx];
float by = pmeBsplineModuliY[ky];
float bz = pmeBsplineModuliZ[kz];
float2 grid = pmeGrid[index];
float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
float denom = m2*bx*by*bz;
float eterm = RECIP_SCALE_FACTOR*exp(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
energyBuffer[get_global_id(0)] += 0.5f*energy;
}
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid) {
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
float3 force = 0.0f;
float4 pos = posq[atom];
float4 t = (float4) ((pos.x/PERIODIC_BOX_SIZE_X+1.0f)*GRID_SIZE_X,
(pos.y/PERIODIC_BOX_SIZE_Y+1.0f)*GRID_SIZE_Y,
(pos.z/PERIODIC_BOX_SIZE_Z+1.0f)*GRID_SIZE_Z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = (gridIndex.x + ix) % GRID_SIZE_X;
float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = (gridIndex.y + iy) % GRID_SIZE_Y;
float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = (gridIndex.z + iz) % GRID_SIZE_Z;
float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
float gridvalue = pmeGrid[index].x;
force.x += dtx*ty*tz*gridvalue;
force.y += tx*dty*tz*gridvalue;
force.z += tx*ty*dtz*gridvalue;
}
}
}
float4 totalForce = forceBuffers[atom];
float q = pos.w*EPSILON_FACTOR;
totalForce.x -= q*force.x*GRID_SIZE_X/PERIODIC_BOX_SIZE_X;
totalForce.y -= q*force.y*GRID_SIZE_Y/PERIODIC_BOX_SIZE_Y;
totalForce.z -= q*force.z*GRID_SIZE_Z/PERIODIC_BOX_SIZE_Z;
forceBuffers[atom] = totalForce;
}
}
......@@ -123,7 +123,7 @@ void testEwaldPME() {
ASSERT_EQUAL_TOL(norm, (clState2.getPotentialEnergy()-clState.getPotentialEnergy())/delta, tol)
// (3) Check whether the Reference and OpenCL platforms agree when using PME
/*
nonbonded->setNonbondedMethod(NonbondedForce::PME);
clContext.reinitialize();
referenceContext.reinitialize();
......@@ -158,7 +158,7 @@ void testEwaldPME() {
tol = 1e-3;
State clState3 = clContext3.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (clState3.getPotentialEnergy()-clState.getPotentialEnergy())/delta, tol)*/
ASSERT_EQUAL_TOL(norm, (clState3.getPotentialEnergy()-clState.getPotentialEnergy())/delta, tol)
}
void testEwald2Ions() {
......@@ -244,7 +244,7 @@ int main() {
testEwaldPME();
// testEwald2Ions();
testErrorTolerance(NonbondedForce::Ewald);
// testErrorTolerance(NonbondedForce::PME);
testErrorTolerance(NonbondedForce::PME);
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment