Commit 98c30a3f authored by Peter Eastman's avatar Peter Eastman
Browse files

Major optimizations to PME performance

parent b880b5d9
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "bbsort_kernel.cu" #include "bbsort_kernel.cu"
float getValue(float2 v){ int getValue(int2 v){
return v.y; return v.y;
} }
...@@ -115,7 +115,7 @@ void reduceMinMax(T* dData,int size,float& result,bool isMax) ...@@ -115,7 +115,7 @@ void reduceMinMax(T* dData,int size,float& result,bool isMax)
CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost)); CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
result=(float)getValue(originalResult); result=(int)getValue(originalResult);
} }
template <typename T> template <typename T>
...@@ -305,7 +305,7 @@ Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile com ...@@ -305,7 +305,7 @@ Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile com
*************************************************************************************/ *************************************************************************************/
template<> template<>
void bbSort(float2* dData,int size,int listOrder) void bbSort(int2* dData,int size,int listOrder)
{ {
bbSortBody(dData,size,listOrder); bbSortBody(dData,size,listOrder);
......
...@@ -19,7 +19,7 @@ texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets; ...@@ -19,7 +19,7 @@ texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices; texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket; texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
__device__ float dGetValue(float2 v){ __device__ int dGetValue(int2 v){
return v.y; return v.y;
} }
...@@ -29,7 +29,7 @@ __device__ T dGetValue(T v){ ...@@ -29,7 +29,7 @@ __device__ T dGetValue(T v){
} }
__device__ void dPad(float2& v){ __device__ void dPad(int2& v){
v.x=0x3fffffff; v.x=0x3fffffff;
v.y=0x4fffffff; v.y=0x4fffffff;
} }
......
...@@ -420,7 +420,7 @@ struct cudaGmxSimulation { ...@@ -420,7 +420,7 @@ struct cudaGmxSimulation {
float4* pPmeBsplineTheta; float4* pPmeBsplineTheta;
float4* pPmeBsplineDtheta; float4* pPmeBsplineDtheta;
int* pPmeAtomRange; // The range of sorted atoms at each grid point int* pPmeAtomRange; // The range of sorted atoms at each grid point
float2* pPmeAtomGridIndex; // The grid point each atom is at int2* pPmeAtomGridIndex; // The grid point each atom is at
unsigned int bonds; // Number of bonds unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters float2* pBondParameter; // Bond parameters
......
...@@ -994,7 +994,7 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSiz ...@@ -994,7 +994,7 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSiz
gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData; gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange"); gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData; gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex"); gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData; gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
tabulateErfc(gpu); tabulateErfc(gpu);
......
...@@ -122,7 +122,7 @@ struct _gpuContext { ...@@ -122,7 +122,7 @@ struct _gpuContext {
CUDAStream<float4>* psPmeBsplineTheta; CUDAStream<float4>* psPmeBsplineTheta;
CUDAStream<float4>* psPmeBsplineDtheta; CUDAStream<float4>* psPmeBsplineDtheta;
CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point
CUDAStream<float2>* psPmeAtomGridIndex; // The grid point each atom is at CUDAStream<int2>* psPmeAtomGridIndex; // The grid point each atom is at
CUDAStream<float2>* psObcData; CUDAStream<float2>* psObcData;
CUDAStream<float4>* psGBVIData; CUDAStream<float4>* psGBVIData;
CUDAStream<float>* psObcChain; CUDAStream<float>* psObcChain;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman * * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -117,7 +117,7 @@ void kUpdateGridIndexAndFraction_kernel() ...@@ -117,7 +117,7 @@ void kUpdateGridIndexAndFraction_kernel()
int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x, int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
((int) t.y) % cSim.pmeGridSize.y, ((int) t.y) % cSim.pmeGridSize.y,
((int) t.z) % cSim.pmeGridSize.z); ((int) t.z) % cSim.pmeGridSize.z);
cSim.pPmeAtomGridIndex[i] = make_float2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z); cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
} }
} }
...@@ -141,7 +141,7 @@ void kFindAtomRangeForGrid_kernel() ...@@ -141,7 +141,7 @@ void kFindAtomRangeForGrid_kernel()
int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y); int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) for (int i = start; i < end; ++i)
{ {
float2 atomData = cSim.pPmeAtomGridIndex[i]; int2 atomData = cSim.pPmeAtomGridIndex[i];
int gridIndex = atomData.y; int gridIndex = atomData.y;
if (gridIndex != last) if (gridIndex != last)
{ {
...@@ -150,10 +150,13 @@ void kFindAtomRangeForGrid_kernel() ...@@ -150,10 +150,13 @@ void kFindAtomRangeForGrid_kernel()
last = gridIndex; last = gridIndex;
} }
// The grid index won't be needed again. Reuse that component to hold the atom charge, thus saving // The grid index won't be needed again. Reuse that component to hold the z index, thus saving
// an extra load operation in the charge spreading kernel. // some work in the charge spreading kernel.
cSim.pPmeAtomGridIndex[i].y = cSim.pPosq[(int) atomData.x].w; float posz = cSim.pPosq[atomData.x].z;
posz -= floor(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
cSim.pPmeAtomGridIndex[i].y = z;
} }
// Fill in values beyond the last atom. // Fill in values beyond the last atom.
...@@ -266,28 +269,47 @@ void kGridSpreadCharge_kernel() ...@@ -266,28 +269,47 @@ void kGridSpreadCharge_kernel()
int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z; int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
gridPoint.y = remainder/cSim.pmeGridSize.z; gridPoint.y = remainder/cSim.pmeGridSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z; gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
gridPoint.x += cSim.pmeGridSize.x;
gridPoint.y += cSim.pmeGridSize.y;
gridPoint.z += cSim.pmeGridSize.z;
float result = 0.0f; float result = 0.0f;
for (int ix = 0; ix < PME_ORDER; ++ix) for (int ix = 0; ix < PME_ORDER; ++ix)
{
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
for (int iy = 0; iy < PME_ORDER; ++iy) for (int iy = 0; iy < PME_ORDER; ++iy)
for (int iz = 0; iz < PME_ORDER; ++iz) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
int firstAtom = cSim.pPmeAtomRange[gridIndex1];
int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
}
if (z1 > gridPoint.z)
{ {
int x = (gridPoint.x-ix)%cSim.pmeGridSize.x; gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
int y = (gridPoint.y-iy)%cSim.pmeGridSize.y; gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
int z = (gridPoint.z-iz)%cSim.pmeGridSize.z; firstAtom = cSim.pPmeAtomRange[gridIndex1];
int gridIndex = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z; lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
int firstAtom = cSim.pPmeAtomRange[gridIndex];
int lastAtom = cSim.pPmeAtomRange[gridIndex+1];
for (int i = firstAtom; i < lastAtom; ++i) for (int i = firstAtom; i < lastAtom; ++i)
{ {
float2 atomData = cSim.pPmeAtomGridIndex[i]; int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x; int atomIndex = atomData.x;
float atomCharge = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z; result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
} }
} }
}
}
cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f); cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
} }
} }
...@@ -358,17 +380,20 @@ void kGridInterpolateForce_kernel() ...@@ -358,17 +380,20 @@ void kGridInterpolateForce_kernel()
((int) t.z) % cSim.pmeGridSize.z); ((int) t.z) % cSim.pmeGridSize.z);
for (int ix = 0; ix < PME_ORDER; ix++) for (int ix = 0; ix < PME_ORDER; ix++)
{ {
int xindex = (gridIndex.x + ix) % cSim.pmeGridSize.x; int xindex = gridIndex.x+ix;
xindex -= (xindex >= cSim.pmeGridSize.x ? cSim.pmeGridSize.x : 0);
float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x; float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x; float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
for (int iy = 0; iy < PME_ORDER; iy++) for (int iy = 0; iy < PME_ORDER; iy++)
{ {
int yindex = (gridIndex.y + iy) % cSim.pmeGridSize.y; int yindex = gridIndex.y+iy;
yindex -= (yindex >= cSim.pmeGridSize.y ? cSim.pmeGridSize.y : 0);
float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y; float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y; float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
for (int iz = 0; iz < PME_ORDER; iz++) for (int iz = 0; iz < PME_ORDER; iz++)
{ {
int zindex = (gridIndex.z + iz) % cSim.pmeGridSize.z; int zindex = gridIndex.z+iz;
zindex -= (zindex >= cSim.pmeGridSize.z ? cSim.pmeGridSize.z : 0);
float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z; float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z; float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
int index = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex; int index = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;
......
...@@ -1151,8 +1151,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1151,8 +1151,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta"); pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
pmeBsplineDtheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDtheta"); pmeBsplineDtheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDtheta");
pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = new OpenCLArray<mm_float2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort<mm_float2>(cl, cl.getNumAtoms(), "float2", "value.y"); sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y");
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1303,10 +1303,11 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) { ...@@ -1303,10 +1303,11 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer()); pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL); pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer()); pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeBsplineTheta->getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
......
...@@ -491,9 +491,9 @@ private: ...@@ -491,9 +491,9 @@ private:
OpenCLArray<mm_float4>* pmeBsplineTheta; OpenCLArray<mm_float4>* pmeBsplineTheta;
OpenCLArray<mm_float4>* pmeBsplineDtheta; OpenCLArray<mm_float4>* pmeBsplineDtheta;
OpenCLArray<cl_int>* pmeAtomRange; OpenCLArray<cl_int>* pmeAtomRange;
OpenCLArray<mm_float2>* pmeAtomGridIndex; OpenCLArray<mm_int2>* pmeAtomGridIndex;
OpenCLArray<cl_float>* erfcTable; OpenCLArray<cl_float>* erfcTable;
OpenCLSort<mm_float2>* sort; OpenCLSort<mm_int2>* sort;
OpenCLFFT3D* fft; OpenCLFFT3D* fft;
cl::Kernel exceptionsKernel; cl::Kernel exceptionsKernel;
cl::Kernel ewaldSumsKernel; cl::Kernel ewaldSumsKernel;
......
__kernel void updateGridIndexAndFraction(__global float4* posq, __global float2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void updateGridIndexAndFraction(__global float4* posq, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
float4 pos = posq[i]; float4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
...@@ -10,7 +10,7 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2* ...@@ -10,7 +10,7 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X, int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0); ((int) t.z) % GRID_SIZE_Z, 0);
pmeAtomGridIndex[i] = (float2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z); pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
} }
} }
...@@ -18,12 +18,12 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2* ...@@ -18,12 +18,12 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
* For each grid point, find the range of sorted atoms associated with that point. * For each grid point, find the range of sorted atoms associated with that point.
*/ */
__kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange) { __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0); int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0); int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y); int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) { for (int i = start; i < end; ++i) {
float2 atomData = pmeAtomGridIndex[i]; int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y; int gridIndex = atomData.y;
if (gridIndex != last) { if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j) for (int j = last+1; j <= gridIndex; ++j)
...@@ -41,7 +41,7 @@ __kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global i ...@@ -41,7 +41,7 @@ __kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global i
} }
} }
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global float2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1); const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
...@@ -81,45 +81,71 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT ...@@ -81,45 +81,71 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
} }
} }
// The grid index won't be needed again. Reuse that component to hold the atom charge, thus saving // The grid index won't be needed again. Reuse that component to hold the z index, thus saving
// an extra load operation in the charge spreading kernel. // some work in the charge spreading kernel.
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0); int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0); int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
for (int i = start; i < end; ++i) { for (int i = start; i < end; ++i) {
float2 atomData = pmeAtomGridIndex[i]; float posz = posq[pmeAtomGridIndex[i].x].z;
pmeAtomGridIndex[i].y = posq[(int) atomData.x].w; posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
pmeAtomGridIndex[i].y = z;
} }
} }
__kernel void gridSpreadCharge(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) { __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) { for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
int4 gridPoint; int4 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z); gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z; int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z; gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z; gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
gridPoint.x += GRID_SIZE_X;
gridPoint.y += GRID_SIZE_Y;
gridPoint.z += GRID_SIZE_Z;
float result = 0.0f; float result = 0.0f;
for (int ix = 0; ix < PME_ORDER; ++ix)
for (int iy = 0; iy < PME_ORDER; ++iy) // Loop over all atoms that affect this grid point.
for (int iz = 0; iz < PME_ORDER; ++iz) {
int x = (gridPoint.x-ix)%GRID_SIZE_X; for (int ix = 0; ix < PME_ORDER; ++ix) {
int y = (gridPoint.y-iy)%GRID_SIZE_Y; int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
int z = (gridPoint.z-iz)%GRID_SIZE_Z; for (int iy = 0; iy < PME_ORDER; ++iy) {
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z; int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int firstAtom = pmeAtomRange[gridIndex]; int z1 = gridPoint.z-PME_ORDER+1;
int lastAtom = pmeAtomRange[gridIndex+1]; z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
for (int i = firstAtom; i < lastAtom; ++i) { int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
float2 atomData = pmeAtomGridIndex[i]; int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = posq[atomIndex].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
if (z1 > gridPoint.z)
{
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x; int atomIndex = atomData.x;
float atomCharge = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = posq[atomIndex].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z; result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
} }
} }
}
}
pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f); pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
} }
} }
...@@ -168,15 +194,18 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force ...@@ -168,15 +194,18 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0); ((int) t.z) % GRID_SIZE_Z, 0);
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = (gridIndex.x + ix) % GRID_SIZE_X; int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x; float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x; float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = (gridIndex.y + iy) % GRID_SIZE_Y; int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y; float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y; float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = (gridIndex.z + iz) % GRID_SIZE_Z; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z; float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z; float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment