Commit 1b12ede8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to PME in CUDA

parent dea16a26
......@@ -686,8 +686,8 @@ private:
CudaArray exceptionOffsetIndices;
CudaArray globalParams;
CudaArray cosSinSums;
CudaArray directPmeGrid;
CudaArray reciprocalPmeGrid;
CudaArray pmeGrid1;
CudaArray pmeGrid2;
CudaArray pmeBsplineModuliX;
CudaArray pmeBsplineModuliY;
CudaArray pmeBsplineModuliZ;
......
......@@ -1772,12 +1772,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
directPmeGrid.initialize(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid.initialize(cu, gridElements, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(directPmeGrid);
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "originalPmeGrid");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
......@@ -2093,7 +2096,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (directPmeGrid.isInitialized() && includeReciprocal) {
if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeStream)
cu.setCurrentStream(pmeStream);
......@@ -2133,50 +2136,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer());
cufftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
else
cufftExecR2C(fftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer());
cufftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
}
else {
fft->execFFT(directPmeGrid, reciprocalPmeGrid, true);
fft->execFFT(pmeGrid1, pmeGrid2, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer());
cufftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer());
cufftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
}
else {
fft->execFFT(reciprocalPmeGrid, directPmeGrid, false);
fft->execFFT(pmeGrid2, pmeGrid1, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
......@@ -2186,58 +2187,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active.
if (doLJPME && hasLJ) {
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
}
cu.clearBuffer(directPmeGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.clearBuffer(pmeGrid1);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
void* finishSpreadArgs[] = {&pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer());
cufftExecD2Z(dispersionFftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
else
cufftExecR2C(dispersionFftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer());
cufftExecR2C(dispersionFftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
}
else {
dispersionFft->execFFT(directPmeGrid, reciprocalPmeGrid, true);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer());
cufftExecZ2D(dispersionFftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
else
cufftExecC2R(dispersionFftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer());
cufftExecC2R(dispersionFftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
}
else {
dispersionFft->execFFT(reciprocalPmeGrid, directPmeGrid, false);
dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
......
......@@ -28,20 +28,33 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
, const real* __restrict__ charges
#endif
) {
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__shared__ int zindexTable[GRID_SIZE_Z+PME_ORDER];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = threadIdx.x; i < GRID_SIZE_Z+PME_ORDER; i += blockDim.x) {
int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
__syncthreads();
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real4 pos = posq[atom];
#ifdef USE_LJPME
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y*EPSILON_FACTOR;
#else
const real charge = CHARGE;
const real charge = (CHARGE)*EPSILON_FACTOR;
#endif
if (charge == 0)
continue;
......@@ -77,34 +90,27 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
// Spread the charge from this atom onto each grid point.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
xbase = xbase*GRID_SIZE_Y;
real dx = charge*data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
ybase = (xbase+ybase)*blockSize;
real dxdy = dx*data[iy].y;
for (int i = 0; i < PME_ORDER; i++) {
int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real add = charge*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
int index = ybase + zindexTable[zindex];
real add = dxdy*data[iz].z;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = index;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else
atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR);
atomicAdd(&originalPmeGrid[index], add);
#endif
}
}
......@@ -112,20 +118,39 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
}
}
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) {
real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index];
extern "C" __global__ void finishSpreadCharge(
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
const long long* __restrict__ grid1,
#else
for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
floatGrid[index] = scale*originalPmeGrid[index/2];
if (index+1 < gridSize)
floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
const real* __restrict__ grid1,
#endif
real* __restrict__ grid2) {
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
__shared__ int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = threadIdx.x; i < GRID_SIZE_Z; i += blockDim.x) {
int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
__syncthreads();
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int xindex = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-xindex*GRID_SIZE_Y*GRID_SIZE_Z;
int yindex = remainder/GRID_SIZE_Z;
int zindex = remainder-yindex*GRID_SIZE_Z;
int loadIndex = zindexTable[zindex]+(xindex*GRID_SIZE_Y+yindex)*blockSize;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
grid2[index] = scale*grid1[loadIndex];
#else
grid2[index] = grid1[loadIndex];
#endif
}
}
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment