Unverified Commit 4885a268 authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2121 from peastman/pme

Optimizations to PME charge spreading
parents 38ae2563 73760081
...@@ -686,8 +686,8 @@ private: ...@@ -686,8 +686,8 @@ private:
CudaArray exceptionOffsetIndices; CudaArray exceptionOffsetIndices;
CudaArray globalParams; CudaArray globalParams;
CudaArray cosSinSums; CudaArray cosSinSums;
CudaArray directPmeGrid; CudaArray pmeGrid1;
CudaArray reciprocalPmeGrid; CudaArray pmeGrid2;
CudaArray pmeBsplineModuliX; CudaArray pmeBsplineModuliX;
CudaArray pmeBsplineModuliY; CudaArray pmeBsplineModuliY;
CudaArray pmeBsplineModuliZ; CudaArray pmeBsplineModuliZ;
......
...@@ -1749,7 +1749,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1749,7 +1749,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX); pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY); pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ); pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["EPSILON_FACTOR"] = "1";
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha)); pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1"; pmeDefines["USE_LJPME"] = "1";
double invRCut6 = pow(force.getCutoffDistance(), -6); double invRCut6 = pow(force.getCutoffDistance(), -6);
...@@ -1772,12 +1771,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1772,12 +1771,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ; int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
if (doLJPME) int gridElements = gridSizeX*gridSizeY*roundedZSize;
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); if (doLJPME) {
directPmeGrid.initialize(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid"); roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
reciprocalPmeGrid.initialize(cu, gridElements, 2*elementSize, "reciprocalPmeGrid"); gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
cu.addAutoclearBuffer(directPmeGrid); }
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -2093,7 +2095,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2093,7 +2095,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid.isInitialized() && includeReciprocal) { if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeStream) if (usePmeStream)
cu.setCurrentStream(pmeStream); cu.setCurrentStream(pmeStream);
...@@ -2133,50 +2135,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2133,50 +2135,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer()); cufftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
else else
cufftExecR2C(fftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer()); cufftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
} }
else { else {
fft->execFFT(directPmeGrid, reciprocalPmeGrid, true); fft->execFFT(pmeGrid1, pmeGrid2, true);
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ); cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
} }
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer()); cufftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
else else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer()); cufftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
} }
else { else {
fft->execFFT(reciprocalPmeGrid, directPmeGrid, false); fft->execFFT(pmeGrid2, pmeGrid1, false);
} }
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
...@@ -2186,58 +2186,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2186,58 +2186,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for // As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active. // now we assume that LJPME can only be used if electrostatic PME is also active.
if (doLJPME && hasLJ) { if (doLJPME && hasLJ) {
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
}
cu.clearBuffer(directPmeGrid); cu.clearBuffer(pmeGrid2);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()}; &sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer()); cufftExecD2Z(dispersionFftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
else else
cufftExecR2C(dispersionFftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer()); cufftExecR2C(dispersionFftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
} }
else { else {
dispersionFft->execFFT(directPmeGrid, reciprocalPmeGrid, true); dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
} }
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer()); cufftExecZ2D(dispersionFftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
else else
cufftExecC2R(dispersionFftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer()); cufftExecC2R(dispersionFftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
} }
else { else {
dispersionFft->execFFT(reciprocalPmeGrid, directPmeGrid, false); dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
} }
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()}; &sigmaEpsilon.getDevicePointer()};
......
...@@ -28,12 +28,25 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -28,12 +28,25 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
, const real* __restrict__ charges , const real* __restrict__ charges
#endif #endif
) { ) {
real3 data[PME_ORDER]; // To improve memory efficiency, we divide indices along the z axis into
const real scale = RECIP(PME_ORDER-1); // PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__shared__ int zindexTable[GRID_SIZE_Z+PME_ORDER];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = threadIdx.x; i < GRID_SIZE_Z+PME_ORDER; i += blockDim.x) {
int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
__syncthreads();
// Process the atoms in spatially sorted order. This improves efficiency when writing // Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values. // the grid values.
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x; int atom = pmeAtomGridIndex[i].x;
real4 pos = posq[atom]; real4 pos = posq[atom];
...@@ -41,7 +54,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -41,7 +54,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
const float2 sigEps = sigmaEpsilon[atom]; const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y; const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else #else
const real charge = CHARGE; const real charge = (CHARGE)*EPSILON_FACTOR;
#endif #endif
if (charge == 0) if (charge == 0)
continue; continue;
...@@ -77,34 +90,27 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -77,34 +90,27 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
// Spread the charge from this atom onto each grid point. // Spread the charge from this atom onto each grid point.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z; xbase = xbase*GRID_SIZE_Y;
real dx = data[ix].x; real dx = charge*data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy; int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z; ybase = (xbase+ybase)*blockSize;
real dy = data[iy].y; real dxdy = dx*data[iy].y;
for (int i = 0; i < PME_ORDER; i++) {
for (int iz = 0; iz < PME_ORDER; iz++) { int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int index = ybase + zindexTable[zindex];
int index = ybase + zindex; real add = dxdy*data[iz].z;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
real add = charge*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000))); atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = index;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else #else
atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR); atomicAdd(&originalPmeGrid[index], add);
#endif #endif
} }
} }
...@@ -112,20 +118,36 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -112,20 +118,36 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
} }
} }
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) { extern "C" __global__ void finishSpreadCharge(
real* floatGrid = (real*) originalPmeGrid; #if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const long long* __restrict__ grid1,
real scale = EPSILON_FACTOR/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index];
#else #else
for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) { const real* __restrict__ grid1,
floatGrid[index] = scale*originalPmeGrid[index/2]; #endif
if (index+1 < gridSize) real* __restrict__ grid2) {
floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2]; // During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
__shared__ int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = threadIdx.x; i < GRID_SIZE_Z; i += blockDim.x) {
int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
} }
__syncthreads();
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int zindex = index%GRID_SIZE_Z;
int loadIndex = zindexTable[zindex] + blockSize*(int) (index/GRID_SIZE_Z);
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
grid2[index] = scale*grid1[loadIndex];
#else
grid2[index] = grid1[loadIndex];
#endif #endif
}
} }
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
......
...@@ -663,7 +663,7 @@ private: ...@@ -663,7 +663,7 @@ private:
OpenCLArray exceptionOffsetIndices; OpenCLArray exceptionOffsetIndices;
OpenCLArray globalParams; OpenCLArray globalParams;
OpenCLArray cosSinSums; OpenCLArray cosSinSums;
OpenCLArray pmeGrid; OpenCLArray pmeGrid1;
OpenCLArray pmeGrid2; OpenCLArray pmeGrid2;
OpenCLArray pmeBsplineModuliX; OpenCLArray pmeBsplineModuliX;
OpenCLArray pmeBsplineModuliY; OpenCLArray pmeBsplineModuliY;
......
...@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["MULTSHIFT6"] = cl.doubleToString(multShift6); defines["MULTSHIFT6"] = cl.doubleToString(multShift6);
} }
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ; int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
if (doLJPME) int gridElements = gridSizeX*gridSizeY*roundedZSize;
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); if (doLJPME) {
pmeGrid.initialize(cl, gridElements, 2*elementSize, "pmeGrid"); roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics()) if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(pmeGrid2); cl.addAutoclearBuffer(pmeGrid2);
else else
cl.addAutoclearBuffer(pmeGrid); cl.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true); dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA"); bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia); usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia);
if (usePmeQueue) { if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1"; pmeDefines["USE_PME_STREAM"] = "1";
...@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer()); ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer()); ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
} }
if (pmeGrid.isInitialized()) { if (pmeGrid1.isInitialized()) {
// Create kernels for Coulomb PME. // Create kernels for Coulomb PME.
map<string, string> replacements; map<string, string> replacements;
...@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics()) if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid1.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics()) if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer());
...@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
} }
if (usePmeQueue) if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy")); syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
...@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics()) if (cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else else
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid1.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics()) if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer());
...@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer()); pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer()); pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
} }
} }
} }
...@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize()); cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid.isInitialized() && includeReciprocal) { if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeQueue && !includeEnergy) if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue); cl.setQueue(pmeQueue);
...@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms()); cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
} }
} }
fft->execFFT(pmeGrid, pmeGrid2, true); fft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]); pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy) if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false); fft->execFFT(pmeGrid2, pmeGrid1, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3); setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]); pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
} }
cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms()); cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) { if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid); cl.clearBuffer(pmeGrid1);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5); setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]); pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
...@@ -2319,6 +2320,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2319,6 +2320,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1); cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
} }
else { else {
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid2); cl.clearBuffer(pmeGrid2);
...@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
} }
else { else {
cl.clearBuffer(pmeGrid); cl.clearBuffer(pmeGrid1);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms()); cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2); setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision()) if (cl.getUseDoublePrecision())
...@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms()); cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
} }
} }
dispersionFft->execFFT(pmeGrid, pmeGrid2, true); dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]); pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy) if (includeEnergy)
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid, false); dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3); setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]); pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
......
...@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
, __global const real* restrict charges , __global const real* restrict charges
#endif #endif
) { ) {
const real scale = 1/(real) (PME_ORDER-1); // To improve memory efficiency, we divide indices along the z axis into
real4 data[PME_ORDER]; // PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__local int zindexTable[GRID_SIZE_Z+PME_ORDER];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = get_local_id(0); i < GRID_SIZE_Z+PME_ORDER; i += get_local_size(0)) {
int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Process the atoms in spatially sorted order. This improves efficiency when writing // Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values. // the grid values.
const real scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER];
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
int atom = pmeAtomGridIndex[i].x; int atom = pmeAtomGridIndex[i].x;
real4 pos = posq[atom]; real4 pos = posq[atom];
...@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const float2 sigEps = sigmaEpsilon[atom]; const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y; const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else #else
const real charge = CHARGE; const real charge = (CHARGE)*EPSILON_FACTOR;
#endif #endif
if (charge == 0) if (charge == 0)
continue; continue;
...@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
// Spread the charge from this atom onto each grid point. // Spread the charge from this atom onto each grid point.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y;
real dx = charge*data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy; int ybase = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) { ybase = (xbase+ybase)*blockSize;
real dxdy = dx*data[iy].y;
for (int i = 0; i < PME_ORDER; i++) {
int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int index = ybase + zindexTable[zindex];
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; real add = dxdy*data[iz].z;
real add = charge*data[ix].x*data[iy].y*data[iz].z;
#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
// On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance. Why?
// I have no idea! And of course on AMD it produces slower performance. GPUs are not meant to be understood.
atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
#else
atom_add(&pmeGrid[index], (long) (add*0x100000000)); atom_add(&pmeGrid[index], (long) (add*0x100000000));
#endif
} }
} }
} }
} }
} }
__kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) { __kernel void finishSpreadCharge(__global long* restrict grid1, __global real* restrict grid2) {
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them and convert fixed point values to floating point.
__local int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = get_local_id(0); i < GRID_SIZE_Z; i += get_local_size(0)) {
int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
barrier(CLK_LOCAL_MEM_FENCE);
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0x100000000; real scale = 1/(real) 0x100000000;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN int zindex = index%GRID_SIZE_Z;
long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2]; int loadIndex = zindexTable[zindex] + blockSize*(int) (index/GRID_SIZE_Z);
#else grid2[index] = scale*grid1[loadIndex];
long value = fixedGrid[index];
#endif
realGrid[index] = (real) (value*scale);
} }
} }
#elif defined(DEVICE_IS_CPU) #elif defined(DEVICE_IS_CPU)
...@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const float2 sigEps = sigmaEpsilon[atom]; const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y; const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else #else
const real charge = CHARGE; const real charge = (CHARGE)*EPSILON_FACTOR;
#endif #endif
if (charge == 0) if (charge == 0)
continue; continue;
...@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] += EPSILON_FACTOR*charge*data[ix].x*data[iy].y*data[iz].z; pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment