Commit 2bb875b0 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to new CUDA platform

parent ca7fd533
......@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures.
......@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
interpolateForceThreads = 64;
int interpolateSharedSize = 2*interpolateForceThreads*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double3) : sizeof(float3));
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), interpolateForceThreads, interpolateSharedSize);
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
......@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic())
defines["USE_PERIODIC"] = "1";
if (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision())
defines["ENABLE_SHUFFLE"] = "1";
defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
defines["PREFACTOR"] = cu.doubleToString(prefactor);
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
......
......@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
defines["ENABLE_SHUFFLE"] = "1";
CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
CUfunction kernel = context.getKernel(program, "computeNonbonded");
......
......@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
#endif
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
......@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[threadIdx.x] = 0.0f;
real sum = 0;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
......@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[threadIdx.x] = term;
sum = term;
}
}
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2)
sum += __shfl_xor(sum, i, 32);
if (tgx == 0)
localData[tbx+j].bornSum += sum;
#else
tempBuffer[threadIdx.x] = sum;
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
#endif
}
}
}
......@@ -351,9 +360,11 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do {
// Extract the coordinates of this tile
......@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real4 delta = make_real4(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
......@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
tempBuffer[threadIdx.x] = make_real4(delta.x, delta.y, delta.z, dGpol_dalpha2_ij*bornRadius1);
delta.w = dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
else
tempBuffer[threadIdx.x] = make_real4(0);
delta = make_real4(0);
#endif
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
delta.x += __shfl_xor(delta.x, i, 32);
delta.y += __shfl_xor(delta.y, i, 32);
delta.z += __shfl_xor(delta.z, i, 32);
delta.w += __shfl_xor(delta.w, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += delta.x;
localData[tbx+j].fy += delta.y;
localData[tbx+j].fz += delta.z;
localData[tbx+j].fw += delta.w;
}
#else
tempBuffer[threadIdx.x] = delta;
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0) {
......@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
#endif
}
}
}
......
......@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
#endif
real energy = 0.0f;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
do {
// Extract the coordinates of this tile
......@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
for (int i = 16; i >= 1; i /= 2) {
delta.x += __shfl_xor(delta.x, i, 32);
delta.y += __shfl_xor(delta.y, i, 32);
delta.z += __shfl_xor(delta.z, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += delta.x;
localData[tbx+j].fy += delta.y;
localData[tbx+j].fz += delta.z;
}
#else
force -= dEdR1;
for (int i = 16; i >= 1; i /= 2) {
dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += dEdR2.x;
localData[tbx+j].fy += dEdR2.y;
localData[tbx+j].fz += dEdR2.z;
}
#endif
#else
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
#else
force -= dEdR1;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
#endif
// Sum the forces on atom2.
......@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
......@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
__threadfence_block();
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
__threadfence_block();
}
pos++;
} while (pos < end);
......
......@@ -77,7 +77,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
// Load the next block of atoms into the buffers.
if (threadIdx.x < BUFFER_SIZE) {
int atomIndex = baseIndex+threadIdx.x;
if (atomIndex < NUM_ATOMS) {
real4 pos = posq[atomIndex];
......@@ -89,7 +88,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
}
}
__syncthreads();
int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
for (int index = 0; index < lastIndex; index++) {
......@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real3 bsplinesCache[];
real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
real3* ddata = &bsplinesCache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
real3 data[PME_ORDER];
real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
real4 force = make_real4(0);
real3 force = make_real3(0);
real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
......@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
// Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
int index = ybase + zindex;
real gridvalue = pmeGrid[index].x;
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
force.x += ddx*dy*data[iz].z*gridvalue;
force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue;
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment