Optimizations to new CUDA platform

2bb875b0 · Peter Eastman · ca7fd533 · 2bb875b0 · 2bb875b0 · 2bb875b0
Commit 2bb875b0 authored Jul 06, 2012 by Peter Eastman
5 changed files
--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -1448,6 +1448,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
        pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+        cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
        // Create required data structures.
@@ -1606,9 +1607,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
                cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
-        interpolateForceThreads = 64;
+        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
-        int interpolateSharedSize = 2*interpolateForceThreads*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double3) : sizeof(float3));
-        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), interpolateForceThreads, interpolateSharedSize);
    }
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    if (dispersionCoefficient != 0.0 && includeDirect) {
@@ -1970,6 +1969,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
            defines["USE_CUTOFF"] = "1";
        if (nb.getUsePeriodic())
            defines["USE_PERIODIC"] = "1";
+        if (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision())
+            defines["ENABLE_SHUFFLE"] = "1";
        defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
        defines["PREFACTOR"] = cu.doubleToString(prefactor);
        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -447,6 +447,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
+    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
+        defines["ENABLE_SHUFFLE"] = "1";
    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");

--- a/platforms/cuda2/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda2/src/kernels/gbsaObc1.cu
@@ -87,10 +87,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
 #endif
    unsigned int lasty = 0xFFFFFFFF;
    __shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
-    __shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
+#endif    
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
@@ -204,7 +205,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                                tempBuffer[threadIdx.x] = 0.0f;
+                                real sum = 0;
 #ifdef USE_CUTOFF
                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
@@ -236,16 +237,24 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                        if (params2.x < params1.y-r)
                                            term += 2.0f*(RECIP(params2.x)-l_ij);
-                                        tempBuffer[threadIdx.x] = term;
+                                        sum = term;
                                    }
                                }
                                // Sum the forces on atom j.
+#ifdef ENABLE_SHUFFLE
+                                for (int i = 16; i >= 1; i /= 2)
+                                    sum += __shfl_xor(sum, i, 32);
+                                if (tgx == 0)
+                                    localData[tbx+j].bornSum += sum;
+#else
+                                tempBuffer[threadIdx.x] = sum;
                                if (tgx % 4 == 0)
                                    tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
                                if (tgx == 0)
                                    localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
+#endif
                            }
                        }
                    }
@@ -351,10 +360,12 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF;
    __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
-    __shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
+#endif    
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
@@ -466,7 +477,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1<<j)) != 0) {
                                real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
-                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                                real4 delta = make_real4(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z, 0);
 #ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
@@ -503,15 +514,30 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                                force.x -= delta.x;
                                force.y -= delta.y;
                                force.z -= delta.z;
-                                tempBuffer[threadIdx.x] = make_real4(delta.x, delta.y, delta.z, dGpol_dalpha2_ij*bornRadius1);
+                                delta.w = dGpol_dalpha2_ij*bornRadius1;
 #ifdef USE_CUTOFF
                                }
                                else
-                                    tempBuffer[threadIdx.x] = make_real4(0);
+                                    delta = make_real4(0);
 #endif
                                // Sum the forces on atom j.
+#ifdef ENABLE_SHUFFLE
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    delta.x += __shfl_xor(delta.x, i, 32);
+                                    delta.y += __shfl_xor(delta.y, i, 32);
+                                    delta.z += __shfl_xor(delta.z, i, 32);
+                                    delta.w += __shfl_xor(delta.w, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += delta.x;
+                                    localData[tbx+j].fy += delta.y;
+                                    localData[tbx+j].fz += delta.z;
+                                    localData[tbx+j].fw += delta.w;
+                                }
+#else
+                                tempBuffer[threadIdx.x] = delta;
                                if (tgx % 4 == 0)
                                    tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
                                if (tgx == 0) {
@@ -521,6 +547,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                                    localData[tbx+j].fz += sum.z;
                                    localData[tbx+j].fw += sum.w;
                                }
+#endif
                            }
                        }
                    }

--- a/platforms/cuda2/src/kernels/nonbonded.cu
+++ b/platforms/cuda2/src/kernels/nonbonded.cu
@@ -35,9 +35,11 @@ extern "C" __global__ void computeNonbonded(
 #endif
    real energy = 0.0f;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
-    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+#endif
    do {
        // Extract the coordinates of this tile
@@ -186,18 +188,46 @@ extern "C" __global__ void computeNonbonded(
 #ifdef USE_CUTOFF
                                }
 #endif
-#ifdef USE_SYMMETRIC
+#ifdef ENABLE_SHUFFLE
+    #ifdef USE_SYMMETRIC
+                                delta *= dEdR;
+                                force -= delta;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    delta.x += __shfl_xor(delta.x, i, 32);
+                                    delta.y += __shfl_xor(delta.y, i, 32);
+                                    delta.z += __shfl_xor(delta.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += delta.x;
+                                    localData[tbx+j].fy += delta.y;
+                                    localData[tbx+j].fz += delta.z;
+                                }
+    #else
+                                force -= dEdR1;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
+                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
+                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += dEdR2.x;
+                                    localData[tbx+j].fy += dEdR2.y;
+                                    localData[tbx+j].fz += dEdR2.z;
+                                }
+    #endif
+#else
+    #ifdef USE_SYMMETRIC
                                delta *= dEdR;
                                force -= delta;
                                tempBuffer[bufferIndex] = delta.x;
                                tempBuffer[bufferIndex+1] = delta.y;
                                tempBuffer[bufferIndex+2] = delta.z;
-#else
+    #else
                                force -= dEdR1;
                                tempBuffer[bufferIndex] = dEdR2.x;
                                tempBuffer[bufferIndex+1] = dEdR2.y;
                                tempBuffer[bufferIndex+2] = dEdR2.z;
-#endif
+    #endif
                                // Sum the forces on atom2.
@@ -211,6 +241,7 @@ extern "C" __global__ void computeNonbonded(
                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
                                }
+#endif
                            }
                        }
                    }
@@ -285,14 +316,12 @@ extern "C" __global__ void computeNonbonded(
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
-            __threadfence_block();
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
-            __threadfence_block();
        }
        pos++;
    } while (pos < end);

--- a/platforms/cuda2/src/kernels/pme.cu
+++ b/platforms/cuda2/src/kernels/pme.cu
@@ -77,18 +77,16 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsi
        for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
            // Load the next block of atoms into the buffers.
-            if (threadIdx.x < BUFFER_SIZE) {
+            int atomIndex = baseIndex+threadIdx.x;
-                int atomIndex = baseIndex+threadIdx.x;
+            if (atomIndex < NUM_ATOMS) {
-                if (atomIndex < NUM_ATOMS) {
+                real4 pos = posq[atomIndex];
-                    real4 pos = posq[atomIndex];
+                charge[threadIdx.x] = pos.w;
-                    charge[threadIdx.x] = pos.w;
+                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                    pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                    pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                    pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
-                    basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
+                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
-                    basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
+                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-                    basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-                }
            }
            __syncthreads();
            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
@@ -162,12 +160,11 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, re
 extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    extern __shared__ real3 bsplinesCache[];
+    real3 data[PME_ORDER];
-    real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
+    real3 ddata[PME_ORDER];
-    real3* ddata = &bsplinesCache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
-        real4 force = make_real4(0);
+        real3 force = make_real3(0);
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
@@ -204,19 +201,25 @@ extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq,
        // Compute the force on this atom.
        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = gridIndex.x+ix;
+            int xbase = gridIndex.x+ix;
-            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real dx = data[ix].x;
+            real ddx = ddata[ix].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = gridIndex.y+iy;
+                int ybase = gridIndex.y+iy;
-                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real dy = data[iy].y;
+                real ddy = ddata[iy].y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
+                    int index = ybase + zindex;
                    real gridvalue = pmeGrid[index].x;
-                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
+                    force.x += ddx*dy*data[iz].z*gridvalue;
-                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
+                    force.y += dx*ddy*data[iz].z*gridvalue;
-                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
+                    force.z += dx*dy*ddata[iz].z*gridvalue;
                }
            }
        }